WIP: Functions: new local allocator for better memory reuse and performance #104630
|
@ -5,6 +5,7 @@
|
|||
|
||||
#include "BLI_allocator.hh"
|
||||
#include "BLI_asan.h"
|
||||
#include "BLI_enumerable_thread_specific.hh"
|
||||
#include "BLI_map.hh"
|
||||
#include "BLI_math_bits.h"
|
||||
#include "BLI_stack.hh"
|
||||
|
@ -13,14 +14,10 @@
|
|||
|
||||
namespace blender {
|
||||
|
||||
class LocalPoolScope {
|
||||
};
|
||||
|
||||
template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
|
||||
private:
|
||||
static constexpr int64_t s_alignment = 64;
|
||||
|
||||
const LocalPoolScope &pool_scope_;
|
||||
Vector<MutableSpan<std::byte>> owned_buffers_;
|
||||
|
||||
struct BufferStack {
|
||||
|
@ -34,7 +31,7 @@ template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, N
|
|||
BLI_NO_UNIQUE_ADDRESS Allocator allocator_;
|
||||
|
||||
public:
|
||||
LocalPool(const LocalPoolScope &pool_scope) : pool_scope_(pool_scope)
|
||||
LocalPool()
|
||||
{
|
||||
for (const int64_t i : IndexRange(small_stacks_.size())) {
|
||||
small_stacks_[i].element_size = 8 * (i + 1);
|
||||
|
@ -129,4 +126,20 @@ template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, N
|
|||
}
|
||||
};
|
||||
|
||||
class LocalMemoryPools {
|
||||
private:
|
||||
threading::EnumerableThreadSpecific<LocalPool<>> pool_by_thread_;
|
||||
|
||||
public:
|
||||
LocalPool<> &local()
|
||||
{
|
||||
return pool_by_thread_.local();
|
||||
}
|
||||
};
|
||||
|
||||
struct Pools {
|
||||
LocalMemoryPools *pools = nullptr;
|
||||
LocalPool<> *local = nullptr;
|
||||
};
|
||||
|
||||
} // namespace blender
|
||||
|
|
|
@ -9,8 +9,7 @@ namespace blender::tests {
|
|||
|
||||
TEST(local_pool, Test)
|
||||
{
|
||||
LocalPoolScope pool_scope;
|
||||
LocalPool pool(pool_scope);
|
||||
LocalPool pool;
|
||||
|
||||
std::cout << pool.allocate(30000, 8) << "\n";
|
||||
}
|
||||
|
|
|
@ -100,7 +100,7 @@ struct Context {
|
|||
*/
|
||||
UserData *user_data;
|
||||
|
||||
LocalPool<> *local_pool = nullptr;
|
||||
Pools pools;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -279,12 +279,12 @@ class LazyFunction {
|
|||
* Allocates storage for this function. The storage will be passed to every call to #execute.
|
||||
* If the function does not keep track of any state, this does not have to be implemented.
|
||||
*/
|
||||
virtual void *init_storage(LocalPool<> &allocator) const;
|
||||
virtual void *init_storage(Pools &pools) const;
|
||||
|
||||
/**
|
||||
* Destruct the storage created in #init_storage.
|
||||
*/
|
||||
virtual void destruct_storage(void *storage, LocalPool<> &allocator) const;
|
||||
virtual void destruct_storage(void *storage, Pools &pools) const;
|
||||
|
||||
/**
|
||||
* Calls `fn` with the input indices that the given `output_index` may depend on. By default
|
||||
|
|
|
@ -85,15 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
|
|||
...);
|
||||
output_usages.fill(ValueUsage::Used);
|
||||
set_outputs.fill(false);
|
||||
LocalPoolScope local_pool_scope;
|
||||
LocalPool<> allocator(local_pool_scope);
|
||||
LocalMemoryPools local_pools;
|
||||
Pools pools{&local_pools, &local_pools.local()};
|
||||
Context context;
|
||||
context.user_data = user_data;
|
||||
context.storage = fn.init_storage(allocator);
|
||||
context.storage = fn.init_storage(pools);
|
||||
context.pools = pools;
|
||||
BasicParams params{
|
||||
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
|
||||
fn.execute(params, context);
|
||||
fn.destruct_storage(context.storage, allocator);
|
||||
fn.destruct_storage(context.storage, pools);
|
||||
|
||||
/* Make sure all outputs have been computed. */
|
||||
BLI_assert(!Span<bool>(set_outputs).contains(false));
|
||||
|
|
|
@ -88,8 +88,8 @@ class GraphExecutor : public LazyFunction {
|
|||
const Logger *logger,
|
||||
const SideEffectProvider *side_effect_provider);
|
||||
|
||||
void *init_storage(LocalPool<> &allocator) const override;
|
||||
void destruct_storage(void *storage, LocalPool<> &allocator) const override;
|
||||
void *init_storage(Pools &pools) const override;
|
||||
void destruct_storage(void *storage, Pools &pools) const override;
|
||||
|
||||
private:
|
||||
void execute_impl(Params &params, const Context &context) const override;
|
||||
|
|
|
@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
|
|||
return outputs_[index].debug_name;
|
||||
}
|
||||
|
||||
void *LazyFunction::init_storage(LocalPool<> & /*allocator*/) const
|
||||
void *LazyFunction::init_storage(Pools & /*pools*/) const
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void LazyFunction::destruct_storage(void *storage, LocalPool<> & /*allocator*/) const
|
||||
void LazyFunction::destruct_storage(void *storage, Pools & /*pools*/) const
|
||||
{
|
||||
BLI_assert(storage == nullptr);
|
||||
UNUSED_VARS_NDEBUG(storage);
|
||||
|
|
|
@ -247,16 +247,7 @@ class Executor {
|
|||
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
|
||||
std::thread::id current_main_thread_;
|
||||
#endif
|
||||
LocalPoolScope local_pool_scope_;
|
||||
struct ThreadLocalData {
|
||||
LocalPool<> local_pool;
|
||||
|
||||
ThreadLocalData(const LocalPoolScope &local_pool_scope) : local_pool(local_pool_scope)
|
||||
{
|
||||
}
|
||||
};
|
||||
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
|
||||
LocalPool<> main_allocator_;
|
||||
/**
|
||||
* Set to false when the first execution ends.
|
||||
*/
|
||||
|
@ -265,14 +256,13 @@ class Executor {
|
|||
friend GraphExecutorLFParams;
|
||||
|
||||
public:
|
||||
Executor(const GraphExecutor &self)
|
||||
: self_(self), loaded_inputs_(self.graph_inputs_.size()), main_allocator_(local_pool_scope_)
|
||||
Executor(const GraphExecutor &self) : self_(self), loaded_inputs_(self.graph_inputs_.size())
|
||||
{
|
||||
/* The indices are necessary, because they are used as keys in #node_states_. */
|
||||
BLI_assert(self_.graph_.node_indices_are_valid());
|
||||
}
|
||||
|
||||
void destruct_self(LocalPool<> & /*parent_allocator*/)
|
||||
void destruct_self(Pools &pools)
|
||||
{
|
||||
if (TaskPool *task_pool = task_pool_.load()) {
|
||||
BLI_task_pool_free(task_pool);
|
||||
|
@ -281,7 +271,7 @@ class Executor {
|
|||
for (const int node_index : range) {
|
||||
const Node &node = *self_.graph_.nodes()[node_index];
|
||||
NodeState &node_state = *node_states_[node_index];
|
||||
this->destruct_node_state(node, node_state, this->get_main_or_local_allocator());
|
||||
this->destruct_node_state(node, node_state, pools);
|
||||
}
|
||||
});
|
||||
this->~Executor();
|
||||
|
@ -329,7 +319,7 @@ class Executor {
|
|||
}
|
||||
}
|
||||
|
||||
this->initialize_static_value_usages(side_effect_nodes, this->get_main_or_local_allocator());
|
||||
this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator());
|
||||
this->schedule_side_effect_nodes(side_effect_nodes, current_task);
|
||||
}
|
||||
|
||||
|
@ -349,25 +339,16 @@ class Executor {
|
|||
Span<const Node *> nodes = self_.graph_.nodes();
|
||||
node_states_.reinitialize(nodes.size());
|
||||
|
||||
auto construct_node_range = [&](const IndexRange range, LocalPool<> &allocator) {
|
||||
/* Construct all node states in parallel. */
|
||||
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
|
||||
LocalPool<> &allocator = this->get_local_allocator();
|
||||
for (const int i : range) {
|
||||
const Node &node = *nodes[i];
|
||||
NodeState &node_state = *allocator.construct<NodeState>().release();
|
||||
node_states_[i] = &node_state;
|
||||
this->construct_initial_node_state(allocator, node, node_state);
|
||||
}
|
||||
};
|
||||
if (nodes.size() <= 256) {
|
||||
construct_node_range(nodes.index_range(), main_allocator_);
|
||||
}
|
||||
else {
|
||||
this->ensure_thread_locals();
|
||||
/* Construct all node states in parallel. */
|
||||
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
|
||||
LocalPool<> &allocator = thread_locals_->local().local_pool;
|
||||
construct_node_range(range, allocator);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void construct_initial_node_state(LocalPool<> &allocator,
|
||||
|
@ -381,18 +362,18 @@ class Executor {
|
|||
node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
|
||||
}
|
||||
|
||||
void destruct_node_state(const Node &node, NodeState &node_state, LocalPool<> &allocator)
|
||||
void destruct_node_state(const Node &node, NodeState &node_state, Pools &pools)
|
||||
{
|
||||
if (node.is_function()) {
|
||||
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
|
||||
if (node_state.storage != nullptr) {
|
||||
fn.destruct_storage(node_state.storage, allocator);
|
||||
fn.destruct_storage(node_state.storage, pools);
|
||||
}
|
||||
}
|
||||
for (const int i : node.inputs().index_range()) {
|
||||
InputState &input_state = node_state.inputs[i];
|
||||
const InputSocket &input_socket = node.input(i);
|
||||
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
|
||||
this->destruct_input_value_if_exists(input_state, input_socket.type(), *pools.local);
|
||||
}
|
||||
std::destroy_at(&node_state);
|
||||
}
|
||||
|
@ -548,7 +529,7 @@ class Executor {
|
|||
|
||||
void forward_newly_provided_inputs(CurrentTask &current_task)
|
||||
{
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_local_allocator();
|
||||
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
|
||||
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
|
||||
if (was_loaded.load()) {
|
||||
|
@ -602,7 +583,7 @@ class Executor {
|
|||
return;
|
||||
}
|
||||
this->forward_newly_provided_input(
|
||||
current_task, this->get_main_or_local_allocator(), graph_input_index, input_data);
|
||||
current_task, this->get_local_allocator(), graph_input_index, input_data);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -721,7 +702,7 @@ class Executor {
|
|||
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
|
||||
{
|
||||
NodeState &node_state = *node_states_[node.index_in_graph()];
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_local_allocator();
|
||||
const LazyFunction &fn = node.function();
|
||||
|
||||
bool node_needs_execution = false;
|
||||
|
@ -787,7 +768,8 @@ class Executor {
|
|||
if (node_needs_execution) {
|
||||
if (!node_state.storage_and_defaults_initialized) {
|
||||
/* Initialize storage. */
|
||||
node_state.storage = fn.init_storage(allocator);
|
||||
Pools pools{context_->pools.pools, &allocator};
|
||||
node_state.storage = fn.init_storage(pools);
|
||||
|
||||
/* Load unlinked inputs. */
|
||||
for (const int input_index : node.inputs().index_range()) {
|
||||
|
@ -899,7 +881,8 @@ class Executor {
|
|||
if (node_state.storage != nullptr) {
|
||||
if (node.is_function()) {
|
||||
const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
|
||||
fn_node.function().destruct_storage(node_state.storage, allocator);
|
||||
Pools pools{context_->pools.pools, &allocator};
|
||||
fn_node.function().destruct_storage(node_state.storage, pools);
|
||||
}
|
||||
node_state.storage = nullptr;
|
||||
}
|
||||
|
@ -926,7 +909,7 @@ class Executor {
|
|||
const int input_index,
|
||||
CurrentTask &current_task)
|
||||
{
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_local_allocator();
|
||||
const InputSocket &input_socket = node.input(input_index);
|
||||
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
||||
this->set_input_unused(locked_node, input_socket, allocator);
|
||||
|
@ -1001,7 +984,7 @@ class Executor {
|
|||
CurrentTask &current_task)
|
||||
{
|
||||
BLI_assert(value_to_forward.get() != nullptr);
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_local_allocator();
|
||||
const CPPType &type = *value_to_forward.type();
|
||||
|
||||
if (self_.logger_ != nullptr) {
|
||||
|
@ -1115,24 +1098,10 @@ class Executor {
|
|||
if (BLI_system_thread_count() <= 1) {
|
||||
return false;
|
||||
}
|
||||
this->ensure_thread_locals();
|
||||
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
|
||||
return true;
|
||||
}
|
||||
|
||||
void ensure_thread_locals()
|
||||
{
|
||||
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
|
||||
if (current_main_thread_ != std::this_thread::get_id()) {
|
||||
BLI_assert_unreachable();
|
||||
}
|
||||
#endif
|
||||
if (!thread_locals_) {
|
||||
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>(
|
||||
[scope = &local_pool_scope_]() { return ThreadLocalData{*scope}; });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow other threads to steal all the nodes that are currently scheduled on this thread.
|
||||
*/
|
||||
|
@ -1168,15 +1137,12 @@ class Executor {
|
|||
});
|
||||
}
|
||||
|
||||
LocalPool<> &get_main_or_local_allocator()
|
||||
LocalPool<> &get_local_allocator()
|
||||
{
|
||||
if (this->use_multi_threading()) {
|
||||
return thread_locals_->local().local_pool;
|
||||
return context_->pools.pools->local();
|
||||
}
|
||||
if (context_ != nullptr && context_->local_pool) {
|
||||
return *context_->local_pool;
|
||||
}
|
||||
return main_allocator_;
|
||||
return *context_->pools.local;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1225,7 +1191,7 @@ class GraphExecutorLFParams final : public Params {
|
|||
OutputState &output_state = node_state_.outputs[index];
|
||||
BLI_assert(!output_state.has_been_computed);
|
||||
if (output_state.value == nullptr) {
|
||||
LocalPool<> &allocator = executor_.get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = executor_.get_local_allocator();
|
||||
const CPPType &type = node_.output(index).type();
|
||||
output_state.value = allocator.allocate(type.size(), type.alignment());
|
||||
}
|
||||
|
@ -1282,7 +1248,7 @@ inline void Executor::execute_node(const FunctionNode &node,
|
|||
BLI_assert(context_ != nullptr);
|
||||
Context fn_context = *context_;
|
||||
fn_context.storage = node_state.storage;
|
||||
fn_context.local_pool = &allocator;
|
||||
fn_context.pools.local = &allocator;
|
||||
|
||||
if (self_.logger_ != nullptr) {
|
||||
self_.logger_->log_before_node_execute(node, node_params, fn_context);
|
||||
|
@ -1339,17 +1305,17 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
|
|||
executor.execute(params, context);
|
||||
}
|
||||
|
||||
void *GraphExecutor::init_storage(LocalPool<> &allocator) const
|
||||
void *GraphExecutor::init_storage(Pools &pools) const
|
||||
{
|
||||
Executor &executor = *allocator.construct<Executor>(*this).release();
|
||||
Executor &executor = *pools.local->construct<Executor>(*this).release();
|
||||
return &executor;
|
||||
}
|
||||
|
||||
void GraphExecutor::destruct_storage(void *storage, LocalPool<> &allocator) const
|
||||
void GraphExecutor::destruct_storage(void *storage, Pools &pools) const
|
||||
{
|
||||
Executor *executor = static_cast<Executor *>(storage);
|
||||
executor->destruct_self(allocator);
|
||||
allocator.deallocate(executor, sizeof(Executor), alignof(Executor));
|
||||
executor->destruct_self(pools);
|
||||
pools.local->deallocate(executor, sizeof(Executor), alignof(Executor));
|
||||
}
|
||||
|
||||
void GraphExecutorLogger::log_socket_value(const Socket &socket,
|
||||
|
|
|
@ -1163,8 +1163,12 @@ static GeometrySet compute_geometry(
|
|||
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
|
||||
user_data.compute_context = &modifier_compute_context;
|
||||
|
||||
blender::LocalPoolScope local_pool_scope;
|
||||
blender::LocalPool<> allocator(local_pool_scope);
|
||||
blender::LocalMemoryPools local_pools;
|
||||
blender::Pools pools;
|
||||
pools.pools = &local_pools;
|
||||
pools.local = &local_pools.local();
|
||||
blender::LocalPool<> &allocator = *pools.local;
|
||||
|
||||
Vector<GMutablePointer> inputs_to_destruct;
|
||||
|
||||
int input_index;
|
||||
|
@ -1190,8 +1194,9 @@ static GeometrySet compute_geometry(
|
|||
}
|
||||
|
||||
lf::Context lf_context;
|
||||
lf_context.storage = graph_executor.init_storage(allocator);
|
||||
lf_context.storage = graph_executor.init_storage(pools);
|
||||
lf_context.user_data = &user_data;
|
||||
lf_context.pools = pools;
|
||||
lf::BasicParams lf_params{graph_executor,
|
||||
param_inputs,
|
||||
param_outputs,
|
||||
|
@ -1199,7 +1204,7 @@ static GeometrySet compute_geometry(
|
|||
param_output_usages,
|
||||
param_set_outputs};
|
||||
graph_executor.execute(lf_params, lf_context);
|
||||
graph_executor.destruct_storage(lf_context.storage, allocator);
|
||||
graph_executor.destruct_storage(lf_context.storage, pools);
|
||||
|
||||
for (GMutablePointer &ptr : inputs_to_destruct) {
|
||||
ptr.destruct();
|
||||
|
@ -1272,6 +1277,7 @@ static void modifyGeometry(ModifierData *md,
|
|||
const ModifierEvalContext *ctx,
|
||||
GeometrySet &geometry_set)
|
||||
{
|
||||
SCOPED_TIMER_AVERAGED(__func__);
|
||||
NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
|
||||
if (nmd->node_group == nullptr) {
|
||||
return;
|
||||
|
|
|
@ -689,17 +689,17 @@ class LazyFunctionForGroupNode : public LazyFunction {
|
|||
graph_executor_->execute(params, group_context);
|
||||
}
|
||||
|
||||
void *init_storage(LocalPool<> &allocator) const override
|
||||
void *init_storage(Pools &pools) const override
|
||||
{
|
||||
Storage *s = allocator.construct<Storage>().release();
|
||||
s->graph_executor_storage = graph_executor_->init_storage(allocator);
|
||||
Storage *s = pools.local->construct<Storage>().release();
|
||||
s->graph_executor_storage = graph_executor_->init_storage(pools);
|
||||
return s;
|
||||
}
|
||||
|
||||
void destruct_storage(void *storage, LocalPool<> &allocator) const override
|
||||
void destruct_storage(void *storage, Pools &pools) const override
|
||||
{
|
||||
Storage *s = static_cast<Storage *>(storage);
|
||||
graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
|
||||
graph_executor_->destruct_storage(s->graph_executor_storage, pools);
|
||||
std::destroy_at(s);
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue