diff --git a/source/blender/blenlib/BLI_local_allocator.hh b/source/blender/blenlib/BLI_local_allocator.hh new file mode 100644 index 00000000000..efc2c7f616d --- /dev/null +++ b/source/blender/blenlib/BLI_local_allocator.hh @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include + +#include "BLI_allocator.hh" +#include "BLI_asan.h" +#include "BLI_enumerable_thread_specific.hh" +#include "BLI_linear_allocator.hh" +#include "BLI_map.hh" +#include "BLI_math_bits.h" +#include "BLI_stack.hh" +#include "BLI_utility_mixins.hh" +#include "BLI_vector.hh" + +// #define BLI_LOCAL_ALLOCATOR_USE_GUARDED +// #define BLI_LOCAL_ALLOCATOR_DEBUG_SIZES + +namespace blender { + +class LocalAllocatorSet; +class LocalAllocator; +class LocalAllocatorPool; + +class LocalAllocatorPool : NonCopyable, NonMovable { + private: + Stack buffers; + int64_t element_size = -1; + int64_t alignment = -1; + + friend LocalAllocator; +}; + +class LocalAllocator : NonCopyable, NonMovable { + private: + static constexpr int64_t s_alignment = 64; + static constexpr int64_t s_global_allocation_threshold = 5 * 1024 * 1024; + LocalAllocatorSet &owner_set_; + AlignedBuffer<256, 64> initial_buffer_; + LinearAllocator<> linear_allocator_; + + struct Head { + int64_t buffer_size; + int64_t buffer_alignment; + }; + static_assert(is_power_of_2_constexpr(sizeof(Head))); + + std::array small_buffer_pools_; + Map> large_buffer_pools_; + + friend LocalAllocatorSet; + + LocalAllocator(LocalAllocatorSet &owner_set); + + public: + ~LocalAllocator(); + + bool is_local() const; + LocalAllocator &local(); + LocalAllocatorSet &owner_set(); + + void *allocate(int64_t size, int64_t alignment); + void deallocate(const void *buffer, int64_t size, int64_t alignment); + + void *allocate(LocalAllocatorPool &pool); + void deallocate(const void *buffer, LocalAllocatorPool &pool); + + void *allocate_with_head(int64_t size, int64_t alignment); + void deallocate_with_head(const void *buffer); + + LocalAllocatorPool &get_pool(int64_t size, int64_t alignment); + + template T &allocate_new(Args &&...args); + template void destruct_free(const T *value); + template MutableSpan allocate_array(int64_t size); + template + MutableSpan allocate_new_array(int64_t size, Args &&...args); + template void destruct_free_array(Span data); + template void destruct_free_array(MutableSpan data); +}; + +class LocalAllocatorSet : NonCopyable, NonMovable { + private: + threading::EnumerableThreadSpecific allocator_by_thread_; + +#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES + std::mutex debug_sizes_mutex_; + Map debug_sizes_; +#endif + + friend LocalAllocator; + + public: + LocalAllocatorSet(); + ~LocalAllocatorSet(); + + LocalAllocator &local(); +}; + +class ThreadedLocalAllocatorRef { + private: + LocalAllocatorSet &allocator_set_; + + public: + ThreadedLocalAllocatorRef(LocalAllocator &allocator) : allocator_set_(allocator.owner_set()) + { + } + + void *allocate(const size_t size, const size_t alignment, const char * /*name*/) + { + LocalAllocator &allocator = allocator_set_.local(); + return allocator.allocate_with_head(size, alignment); + } + + void deallocate(void *ptr) + { + LocalAllocator &allocator = allocator_set_.local(); + allocator.deallocate_with_head(ptr); + } +}; + +class LocalAllocatorRef { + private: + LocalAllocator &allocator_; + + public: + LocalAllocatorRef(LocalAllocator &allocator) : allocator_(allocator) + { + } + + void *allocate(const size_t size, const size_t alignment, const char * /*name*/) + { + return allocator_.allocate_with_head(size, alignment); + } + + void deallocate(void *ptr) + { + allocator_.deallocate_with_head(ptr); + } +}; + +inline bool LocalAllocator::is_local() const +{ + return this == &owner_set_.local(); +} + +inline LocalAllocator &LocalAllocator::local() +{ + return owner_set_.local(); +} + +inline LocalAllocatorSet &LocalAllocator::owner_set() +{ + return owner_set_; +} + +inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment) +{ + LocalAllocatorPool &pool = this->get_pool(size, alignment); + BLI_assert(pool.element_size >= size); + BLI_assert(pool.alignment >= alignment); + + return this->allocate(pool); +} + +inline void LocalAllocator::deallocate(const void *buffer, + const int64_t size, + const int64_t alignment) +{ + LocalAllocatorPool &pool = this->get_pool(size, alignment); + BLI_assert(pool.element_size >= size); + BLI_assert(pool.alignment >= alignment); + + this->deallocate(buffer, pool); +} + +inline void *LocalAllocator::allocate(LocalAllocatorPool &pool) +{ + BLI_assert(this->is_local()); + +#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED + return MEM_mallocN_aligned(pool.element_size, pool.alignment, __func__); +#endif + + void *buffer; + if (!pool.buffers.is_empty()) { + buffer = pool.buffers.pop(); + BLI_asan_unpoison(buffer, pool.element_size); + } + else if (pool.element_size < s_global_allocation_threshold) { + buffer = linear_allocator_.allocate(pool.element_size, pool.alignment); + } + else { + buffer = MEM_mallocN(pool.element_size, __func__); + } + +#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES + { + std::lock_guard lock{owner_set_.debug_sizes_mutex_}; + owner_set_.debug_sizes_.add_new(buffer, pool.element_size); + } +#endif + + return buffer; +} + +inline void LocalAllocator::deallocate(const void *buffer, LocalAllocatorPool &pool) +{ + BLI_assert(this->is_local()); + +#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED + MEM_freeN(const_cast(buffer)); + return; +#endif + +#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES + { + std::lock_guard lock{owner_set_.debug_sizes_mutex_}; + auto [last_size, last_alignment] = owner_set_.debug_sizes_.pop(buffer); + if (last_size != size) { + BLI_assert_unreachable(); + } + if (last_alignment != alignment) { + BLI_assert_unreachable(); + } + } +#endif + +#ifdef DEBUG + memset(const_cast(buffer), -1, pool.element_size); +#endif + + if (pool.element_size < s_global_allocation_threshold) { + BLI_asan_poison(buffer, pool.element_size); + pool.buffers.push(const_cast(buffer)); + } + else { + MEM_freeN(const_cast(buffer)); + } +} + +inline LocalAllocatorPool &LocalAllocator::get_pool(const int64_t size, const int64_t alignment) +{ + BLI_assert(size > 0); + BLI_assert(alignment <= size); + BLI_assert(alignment <= s_alignment); + BLI_assert(is_power_of_2_i(alignment)); + UNUSED_VARS_NDEBUG(alignment); + + BLI_assert(this->is_local()); + if (size <= 64) { + return small_buffer_pools_[(size - 1) >> 3]; + } + const int key = bitscan_reverse_uint64(uint64_t(size)); + return *large_buffer_pools_.lookup_or_add_cb(key, [&]() { + auto pool = std::make_unique(); + pool->element_size = int64_t(1) << (8 * sizeof(int64_t) - key); + pool->alignment = s_alignment; + return pool; + }); +} + +inline void *LocalAllocator::allocate_with_head(int64_t size, int64_t alignment) +{ + const int64_t buffer_size = size + std::max(alignment, sizeof(Head)); + const int64_t buffer_alignment = std::max(alignment, alignof(Head)); + void *buffer = this->allocate(buffer_size, buffer_alignment); + Head *head = new (buffer) Head; + head->buffer_size = buffer_size; + head->buffer_alignment = buffer_alignment; + return head + 1; +} + +inline void LocalAllocator::deallocate_with_head(const void *buffer) +{ + const Head *head = static_cast(buffer) - 1; + this->deallocate(head, head->buffer_size, head->buffer_alignment); +} + +template inline T &LocalAllocator::allocate_new(Args &&...args) +{ + void *buffer = this->allocate(sizeof(T), alignof(T)); + T *value = new (buffer) T(std::forward(args)...); + return *value; +} + +template inline void LocalAllocator::destruct_free(const T *value) +{ + std::destroy_at(value); + this->deallocate(value, sizeof(T), alignof(T)); +} + +template MutableSpan inline LocalAllocator::allocate_array(const int64_t size) +{ + if (size == 0) { + return {}; + } + void *buffer = this->allocate(size * sizeof(T), alignof(T)); + return {static_cast(buffer), size}; +} + +template +MutableSpan inline LocalAllocator::allocate_new_array(const int64_t size, Args &&...args) +{ + MutableSpan array = this->allocate_array(size); + for (const int64_t i : IndexRange(size)) { + new (&array[i]) T(std::forward(args)...); + } + return array; +} + +template inline void LocalAllocator::destruct_free_array(Span data) +{ + if (data.is_empty()) { + return; + } + destruct_n(const_cast(data.data()), data.size()); + this->deallocate(data.data(), data.size_in_bytes(), alignof(T)); +} + +template inline void LocalAllocator::destruct_free_array(MutableSpan data) +{ + this->destruct_free_array(data.as_span()); +} + +inline LocalAllocator &LocalAllocatorSet::local() +{ + return allocator_by_thread_.local(); +} + +} // namespace blender diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt index 7c24fb8b5e7..d9a287679f5 100644 --- a/source/blender/blenlib/CMakeLists.txt +++ b/source/blender/blenlib/CMakeLists.txt @@ -91,6 +91,7 @@ set(SRC intern/lazy_threading.cc intern/length_parameterize.cc intern/listbase.cc + intern/local_allocator.cc intern/math_base.c intern/math_base_inline.c intern/math_base_safe_inline.c @@ -256,6 +257,7 @@ set(SRC BLI_linklist_stack.h BLI_listbase.h BLI_listbase_wrapper.hh + BLI_local_allocator.hh BLI_map.hh BLI_map_slots.hh BLI_math.h @@ -484,6 +486,7 @@ if(WITH_GTESTS) tests/BLI_linear_allocator_test.cc tests/BLI_linklist_lockfree_test.cc tests/BLI_listbase_test.cc + tests/BLI_local_allocator_test.cc tests/BLI_map_test.cc tests/BLI_math_base_safe_test.cc tests/BLI_math_base_test.cc diff --git a/source/blender/blenlib/intern/local_allocator.cc b/source/blender/blenlib/intern/local_allocator.cc new file mode 100644 index 00000000000..73015d3cff5 --- /dev/null +++ b/source/blender/blenlib/intern/local_allocator.cc @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "BLI_local_allocator.hh" + +namespace blender { + +LocalAllocatorSet::LocalAllocatorSet() + : allocator_by_thread_([this]() { return LocalAllocator{*this}; }) +{ +} + +LocalAllocatorSet::~LocalAllocatorSet() = default; + +LocalAllocator::LocalAllocator(LocalAllocatorSet &owner_set) : owner_set_(owner_set) +{ + linear_allocator_.provide_buffer(initial_buffer_); + for (const int64_t i : IndexRange(small_buffer_pools_.size())) { + LocalAllocatorPool &pool = small_buffer_pools_[i]; + pool.element_size = 8 * (i + 1); + pool.alignment = power_of_2_min_u(pool.element_size); + } +} + +LocalAllocator::~LocalAllocator() = default; + +} // namespace blender diff --git a/source/blender/blenlib/tests/BLI_local_allocator_test.cc b/source/blender/blenlib/tests/BLI_local_allocator_test.cc new file mode 100644 index 00000000000..1e7feca5b39 --- /dev/null +++ b/source/blender/blenlib/tests/BLI_local_allocator_test.cc @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +#include "BLI_local_allocator.hh" +#include "BLI_strict_flags.h" + +#include "testing/testing.h" + +namespace blender::tests { + +} // namespace blender::tests diff --git a/source/blender/functions/FN_lazy_function.hh b/source/blender/functions/FN_lazy_function.hh index 5379e5f8506..a3f6987573a 100644 --- a/source/blender/functions/FN_lazy_function.hh +++ b/source/blender/functions/FN_lazy_function.hh @@ -42,6 +42,7 @@ #include "BLI_function_ref.hh" #include "BLI_generic_pointer.hh" #include "BLI_linear_allocator.hh" +#include "BLI_local_allocator.hh" #include "BLI_vector.hh" #include @@ -98,6 +99,8 @@ struct Context { * Custom user data that can be used in the function. */ UserData *user_data; + + LocalAllocator *allocator; }; /** @@ -276,12 +279,12 @@ class LazyFunction { * Allocates storage for this function. The storage will be passed to every call to #execute. * If the function does not keep track of any state, this does not have to be implemented. */ - virtual void *init_storage(LinearAllocator<> &allocator) const; + virtual void *init_storage(LocalAllocator &allocator) const; /** * Destruct the storage created in #init_storage. */ - virtual void destruct_storage(void *storage) const; + virtual void destruct_storage(void *storage, LocalAllocator &allocator) const; /** * Calls `fn` with the input indices that the given `output_index` may depend on. By default diff --git a/source/blender/functions/FN_lazy_function_execute.hh b/source/blender/functions/FN_lazy_function_execute.hh index 5785fddaa51..c604ba19deb 100644 --- a/source/blender/functions/FN_lazy_function_execute.hh +++ b/source/blender/functions/FN_lazy_function_execute.hh @@ -85,14 +85,16 @@ inline void execute_lazy_function_eagerly_impl( ...); output_usages.fill(ValueUsage::Used); set_outputs.fill(false); - LinearAllocator<> allocator; + LocalAllocatorSet allocator_set; + LocalAllocator &allocator = allocator_set.local(); Context context; context.user_data = user_data; context.storage = fn.init_storage(allocator); + context.allocator = &allocator; BasicParams params{ fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs}; fn.execute(params, context); - fn.destruct_storage(context.storage); + fn.destruct_storage(context.storage, allocator); /* Make sure all outputs have been computed. */ BLI_assert(!Span(set_outputs).contains(false)); diff --git a/source/blender/functions/FN_lazy_function_graph_executor.hh b/source/blender/functions/FN_lazy_function_graph_executor.hh index a6ae5cac967..a13ca907c21 100644 --- a/source/blender/functions/FN_lazy_function_graph_executor.hh +++ b/source/blender/functions/FN_lazy_function_graph_executor.hh @@ -59,11 +59,23 @@ class GraphExecutor : public LazyFunction { using Logger = GraphExecutorLogger; using SideEffectProvider = GraphExecutorSideEffectProvider; + struct NodeBufferOffsets { + int node; + int inputs; + int outputs; + }; + + struct PreprocessData { + Array offsets; + int node_state_buffer_size; + }; + private: /** * The graph that is evaluated. */ const Graph &graph_; + const PreprocessData &preprocess_data_; /** * Input and output sockets of the entire graph. */ @@ -85,11 +97,14 @@ class GraphExecutor : public LazyFunction { GraphExecutor(const Graph &graph, Span graph_inputs, Span graph_outputs, + const PreprocessData &preprocess_data, const Logger *logger, const SideEffectProvider *side_effect_provider); - void *init_storage(LinearAllocator<> &allocator) const override; - void destruct_storage(void *storage) const override; + void *init_storage(LocalAllocator &allocator) const override; + void destruct_storage(void *storage, LocalAllocator &allocator) const override; + + static void preprocess(const Graph &graph, PreprocessData &r_preprocess_data); private: void execute_impl(Params ¶ms, const Context &context) const override; diff --git a/source/blender/functions/intern/lazy_function.cc b/source/blender/functions/intern/lazy_function.cc index d42b1889160..e5d7481f4d9 100644 --- a/source/blender/functions/intern/lazy_function.cc +++ b/source/blender/functions/intern/lazy_function.cc @@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const return outputs_[index].debug_name; } -void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const +void *LazyFunction::init_storage(LocalAllocator & /*allocator*/) const { return nullptr; } -void LazyFunction::destruct_storage(void *storage) const +void LazyFunction::destruct_storage(void *storage, LocalAllocator & /*allocator*/) const { BLI_assert(storage == nullptr); UNUSED_VARS_NDEBUG(storage); diff --git a/source/blender/functions/intern/lazy_function_graph_executor.cc b/source/blender/functions/intern/lazy_function_graph_executor.cc index 4270885fc98..55efecab4a9 100644 --- a/source/blender/functions/intern/lazy_function_graph_executor.cc +++ b/source/blender/functions/intern/lazy_function_graph_executor.cc @@ -75,7 +75,7 @@ enum class NodeScheduleState { RunningAndRescheduled, }; -struct InputState { +struct alignas(8) InputState { /** * Value of this input socket. By default, the value is empty. When other nodes are done * computing their outputs, the computed values will be forwarded to linked input sockets. The @@ -97,7 +97,7 @@ struct InputState { bool was_ready_for_execution = false; }; -struct OutputState { +struct alignas(8) OutputState { /** * Keeps track of how the output value is used. If a connected input becomes used, this output * has to become used as well. The output becomes unused when it is used by no input socket @@ -127,7 +127,7 @@ struct OutputState { void *value = nullptr; }; -struct NodeState { +struct alignas(8) NodeState { /** * Needs to be locked when any data in this state is accessed that is not explicitly marked as * not needing the lock. @@ -271,7 +271,7 @@ class Executor { /** * State of every node, indexed by #Node::index_in_graph. */ - Array node_states_; + MutableSpan node_states_; /** * Parameters provided by the caller. This is always non-null, while a node is running. */ @@ -285,15 +285,7 @@ class Executor { #ifdef FN_LAZY_FUNCTION_DEBUG_THREADS std::thread::id current_main_thread_; #endif - /** - * A separate linear allocator for every thread. We could potentially reuse some memory, but that - * doesn't seem worth it yet. - */ - struct ThreadLocalData { - LinearAllocator<> allocator; - }; - std::unique_ptr> thread_locals_; - LinearAllocator<> main_allocator_; + /** * Set to false when the first execution ends. */ @@ -308,18 +300,25 @@ class Executor { BLI_assert(self_.graph_.node_indices_are_valid()); } - ~Executor() + void destruct_state(LocalAllocator &allocator) { if (TaskPool *task_pool = task_pool_.load()) { BLI_task_pool_free(task_pool); } threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) { + LocalAllocator &local_allocator = allocator.local(); for (const int node_index : range) { const Node &node = *self_.graph_.nodes()[node_index]; NodeState &node_state = *node_states_[node_index]; - this->destruct_node_state(node, node_state); + if (!node_state.node_has_finished) { + this->destruct_node_data(node, node_state, local_allocator); + } + std::destroy_at(&node_state); } }); + allocator.deallocate( + node_states_[0], self_.preprocess_data_.node_state_buffer_size, alignof(NodeState)); + allocator.destruct_free_array(node_states_); } /** @@ -364,7 +363,7 @@ class Executor { } } - this->initialize_static_value_usages(side_effect_nodes); + this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator()); this->schedule_side_effect_nodes(side_effect_nodes, current_task); } @@ -382,54 +381,41 @@ class Executor { void initialize_node_states() { Span nodes = self_.graph_.nodes(); - node_states_.reinitialize(nodes.size()); + node_states_ = context_->allocator->allocate_array(nodes.size()); - auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) { - for (const int i : range) { - const Node &node = *nodes[i]; - NodeState &node_state = *allocator.construct().release(); - node_states_[i] = &node_state; - this->construct_initial_node_state(allocator, node, node_state); - } - }; - if (nodes.size() <= 256) { - construct_node_range(nodes.index_range(), main_allocator_); - } - else { - this->ensure_thread_locals(); - /* Construct all node states in parallel. */ - threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) { - LinearAllocator<> &allocator = thread_locals_->local().allocator; - construct_node_range(range, allocator); - }); + void *node_states_buffer = context_->allocator->allocate( + self_.preprocess_data_.node_state_buffer_size, alignof(NodeState)); + + for (const int i : nodes.index_range()) { + const Node &node = *nodes[i]; + const GraphExecutor::NodeBufferOffsets &node_offsets = self_.preprocess_data_.offsets[i]; + void *state_buffer = POINTER_OFFSET(node_states_buffer, node_offsets.node); + NodeState *node_state = new (state_buffer) NodeState(); + node_state->inputs = { + static_cast(POINTER_OFFSET(node_states_buffer, node_offsets.inputs)), + node.inputs().size()}; + node_state->outputs = { + static_cast(POINTER_OFFSET(node_states_buffer, node_offsets.outputs)), + node.outputs().size()}; + default_construct_n(node_state->inputs.data(), node_state->inputs.size()); + default_construct_n(node_state->outputs.data(), node_state->outputs.size()); + node_states_[i] = node_state; } } - void construct_initial_node_state(LinearAllocator<> &allocator, - const Node &node, - NodeState &node_state) - { - const Span node_inputs = node.inputs(); - const Span node_outputs = node.outputs(); - - node_state.inputs = allocator.construct_array(node_inputs.size()); - node_state.outputs = allocator.construct_array(node_outputs.size()); - } - - void destruct_node_state(const Node &node, NodeState &node_state) + void destruct_node_data(const Node &node, NodeState &node_state, LocalAllocator &allocator) { if (node.is_function()) { const LazyFunction &fn = static_cast(node).function(); if (node_state.storage != nullptr) { - fn.destruct_storage(node_state.storage); + fn.destruct_storage(node_state.storage, allocator); } } for (const int i : node.inputs().index_range()) { InputState &input_state = node_state.inputs[i]; const InputSocket &input_socket = node.input(i); - this->destruct_input_value_if_exists(input_state, input_socket.type()); + this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator); } - std::destroy_at(&node_state); } /** @@ -453,7 +439,7 @@ class Executor { this->set_input_required(locked_node, socket); } else { - this->set_input_unused(locked_node, socket); + this->set_input_unused(locked_node, socket, this->get_local_allocator()); } }); } @@ -500,13 +486,14 @@ class Executor { * Most importantly, this function initializes `InputState.usage` and * `OutputState.potential_target_sockets`. */ - void initialize_static_value_usages(const Span side_effect_nodes) + void initialize_static_value_usages(const Span side_effect_nodes, + LocalAllocator &allocator) { const Span all_nodes = self_.graph_.nodes(); /* Used for a search through all nodes that outputs depend on. */ - Stack reachable_nodes_to_check; - Array reachable_node_flags(all_nodes.size(), false); + Stack reachable_nodes_to_check{allocator}; + Array reachable_node_flags{all_nodes.size(), false, allocator}; /* Graph outputs are always reachable. */ for (const InputSocket *socket : self_.graph_outputs_) { @@ -586,7 +573,7 @@ class Executor { void forward_newly_provided_inputs(CurrentTask ¤t_task) { - LinearAllocator<> &allocator = this->get_main_or_local_allocator(); + LocalAllocator &allocator = this->get_local_allocator(); for (const int graph_input_index : self_.graph_inputs_.index_range()) { std::atomic &was_loaded = loaded_inputs_[graph_input_index]; if (was_loaded.load()) { @@ -605,7 +592,7 @@ class Executor { } void forward_newly_provided_input(CurrentTask ¤t_task, - LinearAllocator<> &allocator, + LocalAllocator &allocator, const int graph_input_index, void *input_data) { @@ -621,7 +608,6 @@ class Executor { const Node &node = socket.node(); const int index_in_node = socket.index(); NodeState &node_state = *node_states_[node.index_in_graph()]; - OutputState &output_state = node_state.outputs[index_in_node]; /* The notified output socket might be an input of the entire graph. In this case, notify the * caller that the input is required. */ @@ -640,12 +626,13 @@ class Executor { return; } this->forward_newly_provided_input( - current_task, this->get_main_or_local_allocator(), graph_input_index, input_data); + current_task, this->get_local_allocator(), graph_input_index, input_data); return; } BLI_assert(node.is_function()); this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) { + OutputState &output_state = node_state.outputs[index_in_node]; if (output_state.usage == ValueUsage::Used) { return; } @@ -659,9 +646,9 @@ class Executor { const Node &node = socket.node(); const int index_in_node = socket.index(); NodeState &node_state = *node_states_[node.index_in_graph()]; - OutputState &output_state = node_state.outputs[index_in_node]; this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) { + OutputState &output_state = node_state.outputs[index_in_node]; output_state.potential_target_sockets -= 1; if (output_state.potential_target_sockets == 0) { BLI_assert(output_state.usage != ValueUsage::Unused); @@ -760,7 +747,7 @@ class Executor { void run_node_task(const FunctionNode &node, CurrentTask ¤t_task) { NodeState &node_state = *node_states_[node.index_in_graph()]; - LinearAllocator<> &allocator = this->get_main_or_local_allocator(); + LocalAllocator &allocator = this->get_local_allocator(); const LazyFunction &fn = node.function(); bool node_needs_execution = false; @@ -799,6 +786,7 @@ class Executor { node_state.always_used_inputs_requested = true; } + const bool allow_missing_requested_inputs = fn.allow_missing_requested_inputs(); for (const int input_index : node_state.inputs.index_range()) { InputState &input_state = node_state.inputs[input_index]; if (input_state.was_ready_for_execution) { @@ -808,7 +796,11 @@ class Executor { input_state.was_ready_for_execution = true; continue; } - if (!fn.allow_missing_requested_inputs()) { + const InputSocket &socket = node.input(input_index); + if (socket.origin() == nullptr) { + continue; + } + if (!allow_missing_requested_inputs) { if (input_state.usage == ValueUsage::Used) { return; } @@ -848,7 +840,7 @@ class Executor { /* Importantly, the node must not be locked when it is executed. That would result in locks * being hold very long in some cases and results in multiple locks being hold by the same * thread in the same graph which can lead to deadlocks. */ - this->execute_node(node, node_state, current_task); + this->execute_node(node, node_state, current_task, allocator); } this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) { @@ -857,7 +849,7 @@ class Executor { this->assert_expected_outputs_have_been_computed(locked_node); } #endif - this->finish_node_if_possible(locked_node); + this->finish_node_if_possible(locked_node, allocator); const bool reschedule_requested = node_state.schedule_state == NodeScheduleState::RunningAndRescheduled; node_state.schedule_state = NodeScheduleState::NotScheduled; @@ -895,7 +887,7 @@ class Executor { } } - void finish_node_if_possible(LockedNode &locked_node) + void finish_node_if_possible(LockedNode &locked_node, LocalAllocator &allocator) { const Node &node = locked_node.node; NodeState &node_state = locked_node.node_state; @@ -923,44 +915,44 @@ class Executor { const InputSocket &input_socket = node.input(input_index); InputState &input_state = node_state.inputs[input_index]; if (input_state.usage == ValueUsage::Maybe) { - this->set_input_unused(locked_node, input_socket); - } - else if (input_state.usage == ValueUsage::Used) { - this->destruct_input_value_if_exists(input_state, input_socket.type()); + this->set_input_unused(locked_node, input_socket, allocator); } } - if (node_state.storage != nullptr) { - if (node.is_function()) { - const FunctionNode &fn_node = static_cast(node); - fn_node.function().destruct_storage(node_state.storage); - } - node_state.storage = nullptr; - } + this->destruct_node_data(node, node_state, allocator); } - void destruct_input_value_if_exists(InputState &input_state, const CPPType &type) + void destruct_input_value_if_exists(InputState &input_state, + const CPPType &type, + LocalAllocator &allocator) { if (input_state.value != nullptr) { type.destruct(input_state.value); + allocator.deallocate(input_state.value, type.size(), type.alignment()); input_state.value = nullptr; } } - void execute_node(const FunctionNode &node, NodeState &node_state, CurrentTask ¤t_task); + void execute_node(const FunctionNode &node, + NodeState &node_state, + CurrentTask ¤t_task, + LocalAllocator &allocator); void set_input_unused_during_execution(const Node &node, NodeState &node_state, const int input_index, CurrentTask ¤t_task) { + LocalAllocator &allocator = this->get_local_allocator(); const InputSocket &input_socket = node.input(input_index); this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) { - this->set_input_unused(locked_node, input_socket); + this->set_input_unused(locked_node, input_socket, allocator); }); } - void set_input_unused(LockedNode &locked_node, const InputSocket &input_socket) + void set_input_unused(LockedNode &locked_node, + const InputSocket &input_socket, + LocalAllocator &allocator) { NodeState &node_state = locked_node.node_state; const int input_index = input_socket.index(); @@ -972,7 +964,7 @@ class Executor { } input_state.usage = ValueUsage::Unused; - this->destruct_input_value_if_exists(input_state, input_socket.type()); + this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator); if (input_state.was_ready_for_execution) { return; } @@ -1026,7 +1018,7 @@ class Executor { CurrentTask ¤t_task) { BLI_assert(value_to_forward.get() != nullptr); - LinearAllocator<> &allocator = this->get_main_or_local_allocator(); + LocalAllocator &allocator = this->get_local_allocator(); const CPPType &type = *value_to_forward.type(); if (self_.logger_ != nullptr) { @@ -1038,17 +1030,7 @@ class Executor { const Node &target_node = target_socket->node(); NodeState &node_state = *node_states_[target_node.index_in_graph()]; const int input_index = target_socket->index(); - InputState &input_state = node_state.inputs[input_index]; const bool is_last_target = target_socket == targets.last(); -#ifdef DEBUG - if (input_state.value != nullptr) { - if (self_.logger_ != nullptr) { - self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_); - } - BLI_assert_unreachable(); - } -#endif - BLI_assert(!input_state.was_ready_for_execution); BLI_assert(target_socket->type() == type); BLI_assert(target_socket->origin() == &from_socket); @@ -1072,6 +1054,18 @@ class Executor { continue; } this->with_locked_node(target_node, node_state, current_task, [&](LockedNode &locked_node) { + InputState &input_state = node_state.inputs[input_index]; + +#ifdef DEBUG + if (input_state.value != nullptr) { + if (self_.logger_ != nullptr) { + self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_); + } + BLI_assert_unreachable(); + } +#endif + BLI_assert(!input_state.was_ready_for_execution); + if (input_state.usage == ValueUsage::Unused) { return; } @@ -1089,6 +1083,7 @@ class Executor { } if (value_to_forward.get() != nullptr) { value_to_forward.destruct(); + allocator.deallocate(value_to_forward.get(), type.size(), type.alignment()); } } @@ -1145,23 +1140,10 @@ class Executor { if (BLI_system_thread_count() <= 1) { return false; } - this->ensure_thread_locals(); task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH)); return true; } - void ensure_thread_locals() - { -#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS - if (current_main_thread_ != std::this_thread::get_id()) { - BLI_assert_unreachable(); - } -#endif - if (!thread_locals_) { - thread_locals_ = std::make_unique>(); - } - } - /** * Allow other threads to steal all the nodes that are currently scheduled on this thread. */ @@ -1194,12 +1176,12 @@ class Executor { [](TaskPool * /*pool*/, void *data) { MEM_delete(static_cast(data)); }); } - LinearAllocator<> &get_main_or_local_allocator() + LocalAllocator &get_local_allocator() { if (this->use_multi_threading()) { - return thread_locals_->local().allocator; + return context_->allocator->local(); } - return main_allocator_; + return *context_->allocator; } }; @@ -1248,7 +1230,7 @@ class GraphExecutorLFParams final : public Params { OutputState &output_state = node_state_.outputs[index]; BLI_assert(!output_state.has_been_computed); if (output_state.value == nullptr) { - LinearAllocator<> &allocator = executor_.get_main_or_local_allocator(); + LocalAllocator &allocator = executor_.get_local_allocator(); const CPPType &type = node_.output(index).type(); output_state.value = allocator.allocate(type.size(), type.alignment()); } @@ -1297,13 +1279,15 @@ class GraphExecutorLFParams final : public Params { */ inline void Executor::execute_node(const FunctionNode &node, NodeState &node_state, - CurrentTask ¤t_task) + CurrentTask ¤t_task, + LocalAllocator &allocator) { const LazyFunction &fn = node.function(); GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task}; BLI_assert(context_ != nullptr); Context fn_context = *context_; fn_context.storage = node_state.storage; + fn_context.allocator = &allocator; if (self_.logger_ != nullptr) { self_.logger_->log_before_node_execute(node, node_params, fn_context); @@ -1330,12 +1314,32 @@ inline void Executor::execute_node(const FunctionNode &node, } } +void GraphExecutor::preprocess(const Graph &graph, PreprocessData &r_preprocess_data) +{ + const Span nodes = graph.nodes(); + r_preprocess_data.offsets.reinitialize(nodes.size()); + int offset = 0; + for (const int i : nodes.index_range()) { + const Node &node = *nodes[i]; + NodeBufferOffsets &node_offsets = r_preprocess_data.offsets[i]; + node_offsets.node = offset; + offset += sizeof(NodeState); + node_offsets.inputs = offset; + offset += sizeof(InputState) * node.inputs().size(); + node_offsets.outputs = offset; + offset += sizeof(OutputState) * node.outputs().size(); + } + r_preprocess_data.node_state_buffer_size = offset; +} + GraphExecutor::GraphExecutor(const Graph &graph, const Span graph_inputs, const Span graph_outputs, + const PreprocessData &preprocess_data, const Logger *logger, const SideEffectProvider *side_effect_provider) : graph_(graph), + preprocess_data_(preprocess_data), graph_inputs_(graph_inputs), graph_outputs_(graph_outputs), logger_(logger), @@ -1360,15 +1364,17 @@ void GraphExecutor::execute_impl(Params ¶ms, const Context &context) const executor.execute(params, context); } -void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const +void *GraphExecutor::init_storage(LocalAllocator &allocator) const { - Executor &executor = *allocator.construct(*this).release(); + Executor &executor = allocator.allocate_new(*this); return &executor; } -void GraphExecutor::destruct_storage(void *storage) const +void GraphExecutor::destruct_storage(void *storage, LocalAllocator &allocator) const { - std::destroy_at(static_cast(storage)); + Executor *executor = static_cast(storage); + executor->destruct_state(allocator); + allocator.destruct_free(executor); } void GraphExecutorLogger::log_socket_value(const Socket &socket, diff --git a/source/blender/functions/tests/FN_lazy_function_test.cc b/source/blender/functions/tests/FN_lazy_function_test.cc index 54e1df00cdf..9776a6b4d77 100644 --- a/source/blender/functions/tests/FN_lazy_function_test.cc +++ b/source/blender/functions/tests/FN_lazy_function_test.cc @@ -105,7 +105,11 @@ TEST(lazy_function, SideEffects) SimpleSideEffectProvider side_effect_provider{{&store_node}}; - GraphExecutor executor_fn{graph, {&input_node.output(0)}, {}, nullptr, &side_effect_provider}; + GraphExecutor::PreprocessData preprocess_data; + GraphExecutor::preprocess(graph, preprocess_data); + + GraphExecutor executor_fn{ + graph, {&input_node.output(0)}, {}, preprocess_data, nullptr, &side_effect_provider}; execute_lazy_function_eagerly(executor_fn, nullptr, std::make_tuple(5), std::make_tuple()); EXPECT_EQ(dst1, 15); @@ -167,8 +171,11 @@ TEST(lazy_function, GraphWithCycle) graph.update_node_indices(); + GraphExecutor::PreprocessData preprocess_data; + GraphExecutor::preprocess(graph, preprocess_data); + GraphExecutor executor_fn{ - graph, {&input_node.output(0)}, {&output_node.input(0)}, nullptr, nullptr}; + graph, {&input_node.output(0)}, {&output_node.input(0)}, preprocess_data, nullptr, nullptr}; int result = 0; execute_lazy_function_eagerly( executor_fn, nullptr, std::make_tuple(10), std::make_tuple(&result)); diff --git a/source/blender/modifiers/intern/MOD_nodes.cc b/source/blender/modifiers/intern/MOD_nodes.cc index 15aab38bed7..d8900a05bc1 100644 --- a/source/blender/modifiers/intern/MOD_nodes.cc +++ b/source/blender/modifiers/intern/MOD_nodes.cc @@ -1146,8 +1146,12 @@ static GeometrySet compute_geometry( blender::nodes::GeometryNodesLazyFunctionLogger lf_logger(lf_graph_info); blender::nodes::GeometryNodesLazyFunctionSideEffectProvider lf_side_effect_provider; - lf::GraphExecutor graph_executor{ - lf_graph_info.graph, graph_inputs, graph_outputs, &lf_logger, &lf_side_effect_provider}; + lf::GraphExecutor graph_executor{lf_graph_info.graph, + graph_inputs, + graph_outputs, + lf_graph_info.graph_preprocess_data, + &lf_logger, + &lf_side_effect_provider}; blender::nodes::GeoNodesModifierData geo_nodes_modifier_data; geo_nodes_modifier_data.depsgraph = ctx->depsgraph; @@ -1169,7 +1173,9 @@ static GeometrySet compute_geometry( blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name}; user_data.compute_context = &modifier_compute_context; - blender::LinearAllocator<> allocator; + blender::LocalAllocatorSet allocator_set; + blender::LocalAllocator &allocator = allocator_set.local(); + Vector inputs_to_destruct; int input_index = -1; @@ -1212,6 +1218,7 @@ static GeometrySet compute_geometry( lf::Context lf_context; lf_context.storage = graph_executor.init_storage(allocator); lf_context.user_data = &user_data; + lf_context.allocator = &allocator; lf::BasicParams lf_params{graph_executor, param_inputs, param_outputs, @@ -1219,7 +1226,7 @@ static GeometrySet compute_geometry( param_output_usages, param_set_outputs}; graph_executor.execute(lf_params, lf_context); - graph_executor.destruct_storage(lf_context.storage); + graph_executor.destruct_storage(lf_context.storage, allocator); for (GMutablePointer &ptr : inputs_to_destruct) { ptr.destruct(); @@ -1289,6 +1296,7 @@ static void modifyGeometry(ModifierData *md, const ModifierEvalContext *ctx, GeometrySet &geometry_set) { + SCOPED_TIMER_AVERAGED(__func__); NodesModifierData *nmd = reinterpret_cast(md); if (nmd->node_group == nullptr) { return; diff --git a/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh b/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh index 793798c4974..159db3bd477 100644 --- a/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh +++ b/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh @@ -187,6 +187,7 @@ struct GeometryNodesLazyFunctionGraphInfo { * Mappings between the lazy-function graph and the #bNodeTree. */ GeometryNodeLazyFunctionGraphMapping mapping; + lf::GraphExecutor::PreprocessData graph_preprocess_data; /** * Approximate number of nodes in the graph if all sub-graphs were inlined. * This can be used as a simple heuristic for the complexity of the node group. diff --git a/source/blender/nodes/intern/geometry_nodes_lazy_function.cc b/source/blender/nodes/intern/geometry_nodes_lazy_function.cc index 1a7f1520b12..ad4a3d0168d 100644 --- a/source/blender/nodes/intern/geometry_nodes_lazy_function.cc +++ b/source/blender/nodes/intern/geometry_nodes_lazy_function.cc @@ -769,6 +769,7 @@ class LazyFunctionForGroupNode : public LazyFunction { graph_executor_.emplace(lf_graph_info.graph, std::move(graph_inputs), std::move(graph_outputs), + lf_graph_info.graph_preprocess_data, &*lf_logger_, &*lf_side_effect_provider_); } @@ -805,18 +806,18 @@ class LazyFunctionForGroupNode : public LazyFunction { graph_executor_->execute(params, group_context); } - void *init_storage(LinearAllocator<> &allocator) const override + void *init_storage(LocalAllocator &allocator) const override { - Storage *s = allocator.construct().release(); - s->graph_executor_storage = graph_executor_->init_storage(allocator); - return s; + Storage &s = allocator.allocate_new(); + s.graph_executor_storage = graph_executor_->init_storage(allocator); + return &s; } - void destruct_storage(void *storage) const override + void destruct_storage(void *storage, LocalAllocator &allocator) const override { Storage *s = static_cast(storage); - graph_executor_->destruct_storage(s->graph_executor_storage); - std::destroy_at(s); + graph_executor_->destruct_storage(s->graph_executor_storage, allocator); + allocator.destruct_free(s); } std::string name() const override @@ -1243,6 +1244,7 @@ struct GeometryNodesLazyFunctionGraphBuilder { lf_graph_->update_node_indices(); lf_graph_info_->num_inline_nodes_approximate += lf_graph_->nodes().size(); + lf::GraphExecutor::preprocess(*lf_graph_, lf_graph_info_->graph_preprocess_data); } private: