WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

13 changed files with 548 additions and 134 deletions

View File

@ -0,0 +1,331 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <cstddef>
#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"
// #define BLI_LOCAL_ALLOCATOR_USE_GUARDED
// #define BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
namespace blender {
class LocalAllocatorSet;
class LocalAllocator;
class LocalAllocatorPool;
class LocalAllocatorPool : NonCopyable, NonMovable {
private:
Stack<void *> buffers;
int64_t element_size = -1;
int64_t alignment = -1;
friend LocalAllocator;
};
class LocalAllocator : NonCopyable, NonMovable {
private:
static constexpr int64_t s_alignment = 64;
static constexpr int64_t s_global_allocation_threshold = 5 * 1024 * 1024;
LocalAllocatorSet &owner_set_;
AlignedBuffer<256, 64> initial_buffer_;
LinearAllocator<> linear_allocator_;
struct Head {
int64_t buffer_size;
int64_t buffer_alignment;
};
static_assert(is_power_of_2_constexpr(sizeof(Head)));
std::array<LocalAllocatorPool, 8> small_buffer_pools_;
Map<int, std::unique_ptr<LocalAllocatorPool>> large_buffer_pools_;
friend LocalAllocatorSet;
LocalAllocator(LocalAllocatorSet &owner_set);
public:
~LocalAllocator();
bool is_local() const;
LocalAllocator &local();
LocalAllocatorSet &owner_set();
void *allocate(int64_t size, int64_t alignment);
void deallocate(const void *buffer, int64_t size, int64_t alignment);
void *allocate(LocalAllocatorPool &pool);
void deallocate(const void *buffer, LocalAllocatorPool &pool);
void *allocate_with_head(int64_t size, int64_t alignment);
void deallocate_with_head(const void *buffer);
LocalAllocatorPool &get_pool(int64_t size, int64_t alignment);
template<typename T, typename... Args> T &allocate_new(Args &&...args);
template<typename T> void destruct_free(const T *value);
template<typename T> MutableSpan<T> allocate_array(int64_t size);
template<typename T, typename... Args>
MutableSpan<T> allocate_new_array(int64_t size, Args &&...args);
template<typename T> void destruct_free_array(Span<T> data);
template<typename T> void destruct_free_array(MutableSpan<T> data);
};
class LocalAllocatorSet : NonCopyable, NonMovable {
private:
threading::EnumerableThreadSpecific<LocalAllocator> allocator_by_thread_;
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
std::mutex debug_sizes_mutex_;
Map<const void *, int64_t> debug_sizes_;
#endif
friend LocalAllocator;
public:
LocalAllocatorSet();
~LocalAllocatorSet();
LocalAllocator &local();
};
class ThreadedLocalAllocatorRef {
private:
LocalAllocatorSet &allocator_set_;
public:
ThreadedLocalAllocatorRef(LocalAllocator &allocator) : allocator_set_(allocator.owner_set())
{
}
void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
{
LocalAllocator &allocator = allocator_set_.local();
return allocator.allocate_with_head(size, alignment);
}
void deallocate(void *ptr)
{
LocalAllocator &allocator = allocator_set_.local();
allocator.deallocate_with_head(ptr);
}
};
class LocalAllocatorRef {
private:
LocalAllocator &allocator_;
public:
LocalAllocatorRef(LocalAllocator &allocator) : allocator_(allocator)
{
}
void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
{
return allocator_.allocate_with_head(size, alignment);
}
void deallocate(void *ptr)
{
allocator_.deallocate_with_head(ptr);
}
};
inline bool LocalAllocator::is_local() const
{
return this == &owner_set_.local();
}
inline LocalAllocator &LocalAllocator::local()
{
return owner_set_.local();
}
inline LocalAllocatorSet &LocalAllocator::owner_set()
{
return owner_set_;
}
inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment)
{
LocalAllocatorPool &pool = this->get_pool(size, alignment);
BLI_assert(pool.element_size >= size);
BLI_assert(pool.alignment >= alignment);
return this->allocate(pool);
}
inline void LocalAllocator::deallocate(const void *buffer,
const int64_t size,
const int64_t alignment)
{
LocalAllocatorPool &pool = this->get_pool(size, alignment);
BLI_assert(pool.element_size >= size);
BLI_assert(pool.alignment >= alignment);
this->deallocate(buffer, pool);
}
inline void *LocalAllocator::allocate(LocalAllocatorPool &pool)
{
BLI_assert(this->is_local());
#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
return MEM_mallocN_aligned(pool.element_size, pool.alignment, __func__);
#endif
void *buffer;
if (!pool.buffers.is_empty()) {
buffer = pool.buffers.pop();
BLI_asan_unpoison(buffer, pool.element_size);
}
else if (pool.element_size < s_global_allocation_threshold) {
buffer = linear_allocator_.allocate(pool.element_size, pool.alignment);
}
else {
buffer = MEM_mallocN(pool.element_size, __func__);
}
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
{
std::lock_guard lock{owner_set_.debug_sizes_mutex_};
owner_set_.debug_sizes_.add_new(buffer, pool.element_size);
}
#endif
return buffer;
}
inline void LocalAllocator::deallocate(const void *buffer, LocalAllocatorPool &pool)
{
BLI_assert(this->is_local());
#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
MEM_freeN(const_cast<void *>(buffer));
return;
#endif
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
{
std::lock_guard lock{owner_set_.debug_sizes_mutex_};
const int64_t last_size = owner_set_.debug_sizes_.pop(buffer);
if (last_size != pool.element_size) {
BLI_assert_unreachable();
}
}
#endif
#ifdef DEBUG
memset(const_cast<void *>(buffer), -1, pool.element_size);
#endif
if (pool.element_size < s_global_allocation_threshold) {
BLI_asan_poison(buffer, pool.element_size);
pool.buffers.push(const_cast<void *>(buffer));
}
else {
MEM_freeN(const_cast<void *>(buffer));
}
}
inline LocalAllocatorPool &LocalAllocator::get_pool(const int64_t size, const int64_t alignment)
{
BLI_assert(size > 0);
BLI_assert(alignment <= size);
BLI_assert(alignment <= s_alignment);
BLI_assert(is_power_of_2_i(alignment));
UNUSED_VARS_NDEBUG(alignment);
BLI_assert(this->is_local());
if (size <= 64) {
return small_buffer_pools_[(size - 1) >> 3];
}
const int key = bitscan_reverse_uint64(uint64_t(size));
return *large_buffer_pools_.lookup_or_add_cb(key, [&]() {
auto pool = std::make_unique<LocalAllocatorPool>();
pool->element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
pool->alignment = s_alignment;
return pool;
});
}
inline void *LocalAllocator::allocate_with_head(int64_t size, int64_t alignment)
{
const int64_t buffer_size = size + std::max<int64_t>(alignment, sizeof(Head));
const int64_t buffer_alignment = std::max<int64_t>(alignment, alignof(Head));
void *buffer = this->allocate(buffer_size, buffer_alignment);
Head *head = new (buffer) Head;
head->buffer_size = buffer_size;
head->buffer_alignment = buffer_alignment;
return head + 1;
}
inline void LocalAllocator::deallocate_with_head(const void *buffer)
{
const Head *head = static_cast<const Head *>(buffer) - 1;
this->deallocate(head, head->buffer_size, head->buffer_alignment);
}
template<typename T, typename... Args> inline T &LocalAllocator::allocate_new(Args &&...args)
{
void *buffer = this->allocate(sizeof(T), alignof(T));
T *value = new (buffer) T(std::forward<Args>(args)...);
return *value;
}
template<typename T> inline void LocalAllocator::destruct_free(const T *value)
{
std::destroy_at(value);
this->deallocate(value, sizeof(T), alignof(T));
}
template<typename T> inline MutableSpan<T> LocalAllocator::allocate_array(const int64_t size)
{
if (size == 0) {
return {};
}
void *buffer = this->allocate(size * sizeof(T), alignof(T));
return {static_cast<T *>(buffer), size};
}
template<typename T, typename... Args>
inline MutableSpan<T> LocalAllocator::allocate_new_array(const int64_t size, Args &&...args)
{
MutableSpan<T> array = this->allocate_array<T>(size);
for (const int64_t i : IndexRange(size)) {
new (&array[i]) T(std::forward<Args>(args)...);
}
return array;
}
template<typename T> inline void LocalAllocator::destruct_free_array(Span<T> data)
{
if (data.is_empty()) {
return;
}
destruct_n(const_cast<T *>(data.data()), data.size());
this->deallocate(data.data(), data.size_in_bytes(), alignof(T));
}
template<typename T> inline void LocalAllocator::destruct_free_array(MutableSpan<T> data)
{
this->destruct_free_array(data.as_span());
}
inline LocalAllocator &LocalAllocatorSet::local()
{
return allocator_by_thread_.local();
}
} // namespace blender
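
Not part of the patch, but for review context: a minimal usage sketch of the API declared above. The calls and types are taken from this header; the surrounding function is made up.

#include "BLI_local_allocator.hh"
#include "BLI_stack.hh"

static void local_allocator_usage_sketch()
{
  /* One set per logical task; every thread pulls its own allocator from it. */
  blender::LocalAllocatorSet allocator_set;
  blender::LocalAllocator &allocator = allocator_set.local();

  /* Typed allocation from the pooled memory. */
  int &value = allocator.allocate_new<int>(42);
  allocator.destruct_free(&value);

  /* Raw allocation: size and alignment have to be passed again on deallocation,
   * because this variant stores no header. */
  void *buffer = allocator.allocate(128, 8);
  allocator.deallocate(buffer, 128, 8);

  /* Arrays of value-initialized elements. */
  blender::MutableSpan<float> values = allocator.allocate_new_array<float>(100);
  allocator.destruct_free_array(values);

  /* The *Ref adapters satisfy the allocator interface used by containers,
   * mirroring the Stack/Array usage in the graph executor changes below. */
  blender::Stack<int, 16, blender::LocalAllocatorRef> stack(allocator);
  stack.push(1);
}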

View File

@ -91,6 +91,7 @@ set(SRC
intern/lazy_threading.cc
intern/length_parameterize.cc
intern/listbase.cc
intern/local_allocator.cc
intern/math_base.c
intern/math_base_inline.c
intern/math_base_safe_inline.c
@ -256,6 +257,7 @@ set(SRC
BLI_linklist_stack.h
BLI_listbase.h
BLI_listbase_wrapper.hh
BLI_local_allocator.hh
BLI_map.hh
BLI_map_slots.hh
BLI_math.h
@ -484,6 +486,7 @@ if(WITH_GTESTS)
tests/BLI_linear_allocator_test.cc
tests/BLI_linklist_lockfree_test.cc
tests/BLI_listbase_test.cc
tests/BLI_local_allocator_test.cc
tests/BLI_map_test.cc
tests/BLI_math_base_safe_test.cc
tests/BLI_math_base_test.cc

View File

@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_local_allocator.hh"
namespace blender {
LocalAllocatorSet::LocalAllocatorSet()
: allocator_by_thread_([this]() { return LocalAllocator{*this}; })
{
}
LocalAllocatorSet::~LocalAllocatorSet() = default;
LocalAllocator::LocalAllocator(LocalAllocatorSet &owner_set) : owner_set_(owner_set)
{
linear_allocator_.provide_buffer(initial_buffer_);
for (const int64_t i : IndexRange(small_buffer_pools_.size())) {
LocalAllocatorPool &pool = small_buffer_pools_[i];
pool.element_size = 8 * (i + 1);
pool.alignment = power_of_2_min_u(pool.element_size);
}
}
LocalAllocator::~LocalAllocator() = default;
} // namespace blender

View File

@ -0,0 +1,10 @@
/* SPDX-License-Identifier: Apache-2.0 */
#include "BLI_local_allocator.hh"
#include "BLI_strict_flags.h"
#include "testing/testing.h"
namespace blender::tests {
} // namespace blender::tests
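
The test file is still an empty stub. A first test inside the namespace above could look roughly like this (a sketch only; the test name is made up, and the expectation relies on the pooling behavior implemented in BLI_local_allocator.hh):

TEST(local_allocator, ReuseSmallBuffer)
{
  LocalAllocatorSet allocator_set;
  LocalAllocator &allocator = allocator_set.local();

  /* A freed buffer of a small size class is pushed to a pool and handed out
   * again by the next allocation of the same size class. */
  void *first = allocator.allocate(32, 8);
  allocator.deallocate(first, 32, 8);
  void *second = allocator.allocate(32, 8);
  EXPECT_EQ(first, second);
  allocator.deallocate(second, 32, 8);
}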

View File

@ -42,6 +42,7 @@
#include "BLI_function_ref.hh"
#include "BLI_generic_pointer.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_local_allocator.hh"
#include "BLI_vector.hh"
#include <atomic>
@ -98,6 +99,8 @@ struct Context {
* Custom user data that can be used in the function.
*/
UserData *user_data;
LocalAllocator *allocator;
};
/**
@ -276,12 +279,12 @@ class LazyFunction {
* Allocates storage for this function. The storage will be passed to every call to #execute.
* If the function does not keep track of any state, this does not have to be implemented.
*/
virtual void *init_storage(LinearAllocator<> &allocator) const;
virtual void *init_storage(LocalAllocator &allocator) const;
/**
* Destruct the storage created in #init_storage.
*/
virtual void destruct_storage(void *storage) const;
virtual void destruct_storage(void *storage, LocalAllocator &allocator) const;
/**
* Calls `fn` with the input indices that the given `output_index` may depend on. By default

View File

@ -85,14 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
...);
output_usages.fill(ValueUsage::Used);
set_outputs.fill(false);
LinearAllocator<> allocator;
LocalAllocatorSet allocator_set;
LocalAllocator &allocator = allocator_set.local();
Context context;
context.user_data = user_data;
context.storage = fn.init_storage(allocator);
context.allocator = &allocator;
BasicParams params{
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
fn.execute(params, context);
fn.destruct_storage(context.storage);
fn.destruct_storage(context.storage, allocator);
/* Make sure all outputs have been computed. */
BLI_assert(!Span<bool>(set_outputs).contains(false));

View File

@ -59,11 +59,23 @@ class GraphExecutor : public LazyFunction {
using Logger = GraphExecutorLogger;
using SideEffectProvider = GraphExecutorSideEffectProvider;
struct NodeBufferOffsets {
int node;
int inputs;
int outputs;
};
struct PreprocessData {
Array<NodeBufferOffsets> offsets;
int node_state_buffer_size;
};
private:
/**
* The graph that is evaluated.
*/
const Graph &graph_;
const PreprocessData &preprocess_data_;
/**
* Input and output sockets of the entire graph.
*/
@ -85,11 +97,14 @@ class GraphExecutor : public LazyFunction {
GraphExecutor(const Graph &graph,
Span<const OutputSocket *> graph_inputs,
Span<const InputSocket *> graph_outputs,
const PreprocessData &preprocess_data,
const Logger *logger,
const SideEffectProvider *side_effect_provider);
void *init_storage(LinearAllocator<> &allocator) const override;
void destruct_storage(void *storage) const override;
void *init_storage(LocalAllocator &allocator) const override;
void destruct_storage(void *storage, LocalAllocator &allocator) const override;
static void preprocess(const Graph &graph, PreprocessData &r_preprocess_data);
private:
void execute_impl(Params &params, const Context &context) const override;

View File

@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
return outputs_[index].debug_name;
}
void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
void *LazyFunction::init_storage(LocalAllocator & /*allocator*/) const
{
return nullptr;
}
void LazyFunction::destruct_storage(void *storage) const
void LazyFunction::destruct_storage(void *storage, LocalAllocator & /*allocator*/) const
{
BLI_assert(storage == nullptr);
UNUSED_VARS_NDEBUG(storage);

View File

@ -75,7 +75,7 @@ enum class NodeScheduleState {
RunningAndRescheduled,
};
struct InputState {
struct alignas(8) InputState {
/**
* Value of this input socket. By default, the value is empty. When other nodes are done
* computing their outputs, the computed values will be forwarded to linked input sockets. The
@ -97,7 +97,7 @@ struct InputState {
bool was_ready_for_execution = false;
};
struct OutputState {
struct alignas(8) OutputState {
/**
* Keeps track of how the output value is used. If a connected input becomes used, this output
* has to become used as well. The output becomes unused when it is used by no input socket
@ -127,7 +127,7 @@ struct OutputState {
void *value = nullptr;
};
struct NodeState {
struct alignas(8) NodeState {
/**
* Needs to be locked when any data in this state is accessed that is not explicitly marked as
* not needing the lock.
@ -271,7 +271,7 @@ class Executor {
/**
* State of every node, indexed by #Node::index_in_graph.
*/
Array<NodeState *> node_states_;
MutableSpan<NodeState *> node_states_;
/**
* Parameters provided by the caller. This is always non-null, while a node is running.
*/
@ -285,15 +285,7 @@ class Executor {
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
std::thread::id current_main_thread_;
#endif
/**
* A separate linear allocator for every thread. We could potentially reuse some memory, but that
* doesn't seem worth it yet.
*/
struct ThreadLocalData {
LinearAllocator<> allocator;
};
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
LinearAllocator<> main_allocator_;
/**
* Set to false when the first execution ends.
*/
@ -308,18 +300,25 @@ class Executor {
BLI_assert(self_.graph_.node_indices_are_valid());
}
~Executor()
void destruct_state(LocalAllocator &allocator)
{
if (TaskPool *task_pool = task_pool_.load()) {
BLI_task_pool_free(task_pool);
}
threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) {
LocalAllocator &local_allocator = allocator.local();
for (const int node_index : range) {
const Node &node = *self_.graph_.nodes()[node_index];
NodeState &node_state = *node_states_[node_index];
this->destruct_node_state(node, node_state);
if (!node_state.node_has_finished) {
this->destruct_node_data(node, node_state, local_allocator);
}
std::destroy_at(&node_state);
}
});
allocator.deallocate(
node_states_[0], self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
allocator.destruct_free_array(node_states_);
}
/**
@ -364,7 +363,7 @@ class Executor {
}
}
this->initialize_static_value_usages(side_effect_nodes);
this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator());
this->schedule_side_effect_nodes(side_effect_nodes, current_task);
}
@ -382,54 +381,41 @@ class Executor {
void initialize_node_states()
{
Span<const Node *> nodes = self_.graph_.nodes();
node_states_.reinitialize(nodes.size());
node_states_ = context_->allocator->allocate_array<NodeState *>(nodes.size());
auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
for (const int i : range) {
const Node &node = *nodes[i];
NodeState &node_state = *allocator.construct<NodeState>().release();
node_states_[i] = &node_state;
this->construct_initial_node_state(allocator, node, node_state);
}
};
if (nodes.size() <= 256) {
construct_node_range(nodes.index_range(), main_allocator_);
}
else {
this->ensure_thread_locals();
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LinearAllocator<> &allocator = thread_locals_->local().allocator;
construct_node_range(range, allocator);
});
void *node_states_buffer = context_->allocator->allocate(
self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
for (const int i : nodes.index_range()) {
const Node &node = *nodes[i];
const GraphExecutor::NodeBufferOffsets &node_offsets = self_.preprocess_data_.offsets[i];
void *state_buffer = POINTER_OFFSET(node_states_buffer, node_offsets.node);
NodeState *node_state = new (state_buffer) NodeState();
node_state->inputs = {
static_cast<InputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.inputs)),
node.inputs().size()};
node_state->outputs = {
static_cast<OutputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.outputs)),
node.outputs().size()};
default_construct_n(node_state->inputs.data(), node_state->inputs.size());
default_construct_n(node_state->outputs.data(), node_state->outputs.size());
node_states_[i] = node_state;
}
}
void construct_initial_node_state(LinearAllocator<> &allocator,
const Node &node,
NodeState &node_state)
{
const Span<const InputSocket *> node_inputs = node.inputs();
const Span<const OutputSocket *> node_outputs = node.outputs();
node_state.inputs = allocator.construct_array<InputState>(node_inputs.size());
node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
}
void destruct_node_state(const Node &node, NodeState &node_state)
void destruct_node_data(const Node &node, NodeState &node_state, LocalAllocator &allocator)
{
if (node.is_function()) {
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
if (node_state.storage != nullptr) {
fn.destruct_storage(node_state.storage);
fn.destruct_storage(node_state.storage, allocator);
}
}
for (const int i : node.inputs().index_range()) {
InputState &input_state = node_state.inputs[i];
const InputSocket &input_socket = node.input(i);
this->destruct_input_value_if_exists(input_state, input_socket.type());
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
}
std::destroy_at(&node_state);
}
/**
@ -453,7 +439,7 @@ class Executor {
this->set_input_required(locked_node, socket);
}
else {
this->set_input_unused(locked_node, socket);
this->set_input_unused(locked_node, socket, this->get_local_allocator());
}
});
}
@ -500,13 +486,14 @@ class Executor {
* Most importantly, this function initializes `InputState.usage` and
* `OutputState.potential_target_sockets`.
*/
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes)
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes,
LocalAllocator &allocator)
{
const Span<const Node *> all_nodes = self_.graph_.nodes();
/* Used for a search through all nodes that outputs depend on. */
Stack<const Node *> reachable_nodes_to_check;
Array<bool> reachable_node_flags(all_nodes.size(), false);
Stack<const Node *, 16, LocalAllocatorRef> reachable_nodes_to_check{allocator};
Array<bool, 16, LocalAllocatorRef> reachable_node_flags{all_nodes.size(), false, allocator};
/* Graph outputs are always reachable. */
for (const InputSocket *socket : self_.graph_outputs_) {
@ -586,7 +573,7 @@ class Executor {
void forward_newly_provided_inputs(CurrentTask &current_task)
{
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
if (was_loaded.load()) {
@ -605,7 +592,7 @@ class Executor {
}
void forward_newly_provided_input(CurrentTask &current_task,
LinearAllocator<> &allocator,
LocalAllocator &allocator,
const int graph_input_index,
void *input_data)
{
@ -621,7 +608,6 @@ class Executor {
const Node &node = socket.node();
const int index_in_node = socket.index();
NodeState &node_state = *node_states_[node.index_in_graph()];
OutputState &output_state = node_state.outputs[index_in_node];
/* The notified output socket might be an input of the entire graph. In this case, notify the
* caller that the input is required. */
@ -640,12 +626,13 @@ class Executor {
return;
}
this->forward_newly_provided_input(
current_task, this->get_main_or_local_allocator(), graph_input_index, input_data);
current_task, this->get_local_allocator(), graph_input_index, input_data);
return;
}
BLI_assert(node.is_function());
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
OutputState &output_state = node_state.outputs[index_in_node];
if (output_state.usage == ValueUsage::Used) {
return;
}
@ -659,9 +646,9 @@ class Executor {
const Node &node = socket.node();
const int index_in_node = socket.index();
NodeState &node_state = *node_states_[node.index_in_graph()];
OutputState &output_state = node_state.outputs[index_in_node];
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
OutputState &output_state = node_state.outputs[index_in_node];
output_state.potential_target_sockets -= 1;
if (output_state.potential_target_sockets == 0) {
BLI_assert(output_state.usage != ValueUsage::Unused);
@ -760,7 +747,7 @@ class Executor {
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
{
NodeState &node_state = *node_states_[node.index_in_graph()];
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const LazyFunction &fn = node.function();
bool node_needs_execution = false;
@ -799,6 +786,7 @@ class Executor {
node_state.always_used_inputs_requested = true;
}
const bool allow_missing_requested_inputs = fn.allow_missing_requested_inputs();
for (const int input_index : node_state.inputs.index_range()) {
InputState &input_state = node_state.inputs[input_index];
if (input_state.was_ready_for_execution) {
@ -808,7 +796,11 @@ class Executor {
input_state.was_ready_for_execution = true;
continue;
}
if (!fn.allow_missing_requested_inputs()) {
const InputSocket &socket = node.input(input_index);
if (socket.origin() == nullptr) {
continue;
}
if (!allow_missing_requested_inputs) {
if (input_state.usage == ValueUsage::Used) {
return;
}
@ -848,7 +840,7 @@ class Executor {
/* Importantly, the node must not be locked when it is executed. That would result in locks
* being held for a long time in some cases, and in multiple locks being held by the same
* thread in the same graph, which can lead to deadlocks. */
this->execute_node(node, node_state, current_task);
this->execute_node(node, node_state, current_task, allocator);
}
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
@ -857,7 +849,7 @@ class Executor {
this->assert_expected_outputs_have_been_computed(locked_node);
}
#endif
this->finish_node_if_possible(locked_node);
this->finish_node_if_possible(locked_node, allocator);
const bool reschedule_requested = node_state.schedule_state ==
NodeScheduleState::RunningAndRescheduled;
node_state.schedule_state = NodeScheduleState::NotScheduled;
@ -895,7 +887,7 @@ class Executor {
}
}
void finish_node_if_possible(LockedNode &locked_node)
void finish_node_if_possible(LockedNode &locked_node, LocalAllocator &allocator)
{
const Node &node = locked_node.node;
NodeState &node_state = locked_node.node_state;
@ -923,44 +915,44 @@ class Executor {
const InputSocket &input_socket = node.input(input_index);
InputState &input_state = node_state.inputs[input_index];
if (input_state.usage == ValueUsage::Maybe) {
this->set_input_unused(locked_node, input_socket);
}
else if (input_state.usage == ValueUsage::Used) {
this->destruct_input_value_if_exists(input_state, input_socket.type());
this->set_input_unused(locked_node, input_socket, allocator);
}
}
if (node_state.storage != nullptr) {
if (node.is_function()) {
const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
fn_node.function().destruct_storage(node_state.storage);
}
node_state.storage = nullptr;
}
this->destruct_node_data(node, node_state, allocator);
}
void destruct_input_value_if_exists(InputState &input_state, const CPPType &type)
void destruct_input_value_if_exists(InputState &input_state,
const CPPType &type,
LocalAllocator &allocator)
{
if (input_state.value != nullptr) {
type.destruct(input_state.value);
allocator.deallocate(input_state.value, type.size(), type.alignment());
input_state.value = nullptr;
}
}
void execute_node(const FunctionNode &node, NodeState &node_state, CurrentTask &current_task);
void execute_node(const FunctionNode &node,
NodeState &node_state,
CurrentTask &current_task,
LocalAllocator &allocator);
void set_input_unused_during_execution(const Node &node,
NodeState &node_state,
const int input_index,
CurrentTask &current_task)
{
LocalAllocator &allocator = this->get_local_allocator();
const InputSocket &input_socket = node.input(input_index);
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
this->set_input_unused(locked_node, input_socket);
this->set_input_unused(locked_node, input_socket, allocator);
});
}
void set_input_unused(LockedNode &locked_node, const InputSocket &input_socket)
void set_input_unused(LockedNode &locked_node,
const InputSocket &input_socket,
LocalAllocator &allocator)
{
NodeState &node_state = locked_node.node_state;
const int input_index = input_socket.index();
@ -972,7 +964,7 @@ class Executor {
}
input_state.usage = ValueUsage::Unused;
this->destruct_input_value_if_exists(input_state, input_socket.type());
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
if (input_state.was_ready_for_execution) {
return;
}
@ -1026,7 +1018,7 @@ class Executor {
CurrentTask &current_task)
{
BLI_assert(value_to_forward.get() != nullptr);
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const CPPType &type = *value_to_forward.type();
if (self_.logger_ != nullptr) {
@ -1038,17 +1030,7 @@ class Executor {
const Node &target_node = target_socket->node();
NodeState &node_state = *node_states_[target_node.index_in_graph()];
const int input_index = target_socket->index();
InputState &input_state = node_state.inputs[input_index];
const bool is_last_target = target_socket == targets.last();
#ifdef DEBUG
if (input_state.value != nullptr) {
if (self_.logger_ != nullptr) {
self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
}
BLI_assert_unreachable();
}
#endif
BLI_assert(!input_state.was_ready_for_execution);
BLI_assert(target_socket->type() == type);
BLI_assert(target_socket->origin() == &from_socket);
@ -1072,6 +1054,18 @@ class Executor {
continue;
}
this->with_locked_node(target_node, node_state, current_task, [&](LockedNode &locked_node) {
InputState &input_state = node_state.inputs[input_index];
#ifdef DEBUG
if (input_state.value != nullptr) {
if (self_.logger_ != nullptr) {
self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
}
BLI_assert_unreachable();
}
#endif
BLI_assert(!input_state.was_ready_for_execution);
if (input_state.usage == ValueUsage::Unused) {
return;
}
@ -1089,6 +1083,7 @@ class Executor {
}
if (value_to_forward.get() != nullptr) {
value_to_forward.destruct();
allocator.deallocate(value_to_forward.get(), type.size(), type.alignment());
}
}
@ -1145,23 +1140,10 @@ class Executor {
if (BLI_system_thread_count() <= 1) {
return false;
}
this->ensure_thread_locals();
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
return true;
}
void ensure_thread_locals()
{
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
if (current_main_thread_ != std::this_thread::get_id()) {
BLI_assert_unreachable();
}
#endif
if (!thread_locals_) {
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
}
}
/**
* Allow other threads to steal all the nodes that are currently scheduled on this thread.
*/
@ -1194,12 +1176,12 @@ class Executor {
[](TaskPool * /*pool*/, void *data) { MEM_delete(static_cast<ScheduledNodes *>(data)); });
}
LinearAllocator<> &get_main_or_local_allocator()
LocalAllocator &get_local_allocator()
{
if (this->use_multi_threading()) {
return thread_locals_->local().allocator;
return context_->allocator->local();
}
return main_allocator_;
return *context_->allocator;
}
};
@ -1248,7 +1230,7 @@ class GraphExecutorLFParams final : public Params {
OutputState &output_state = node_state_.outputs[index];
BLI_assert(!output_state.has_been_computed);
if (output_state.value == nullptr) {
LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
LocalAllocator &allocator = executor_.get_local_allocator();
const CPPType &type = node_.output(index).type();
output_state.value = allocator.allocate(type.size(), type.alignment());
}
@ -1297,13 +1279,15 @@ class GraphExecutorLFParams final : public Params {
*/
inline void Executor::execute_node(const FunctionNode &node,
NodeState &node_state,
CurrentTask &current_task)
CurrentTask &current_task,
LocalAllocator &allocator)
{
const LazyFunction &fn = node.function();
GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task};
BLI_assert(context_ != nullptr);
Context fn_context = *context_;
fn_context.storage = node_state.storage;
fn_context.allocator = &allocator;
if (self_.logger_ != nullptr) {
self_.logger_->log_before_node_execute(node, node_params, fn_context);
@ -1330,12 +1314,32 @@ inline void Executor::execute_node(const FunctionNode &node,
}
}
void GraphExecutor::preprocess(const Graph &graph, PreprocessData &r_preprocess_data)
{
const Span<const Node *> nodes = graph.nodes();
r_preprocess_data.offsets.reinitialize(nodes.size());
int offset = 0;
for (const int i : nodes.index_range()) {
const Node &node = *nodes[i];
NodeBufferOffsets &node_offsets = r_preprocess_data.offsets[i];
node_offsets.node = offset;
offset += sizeof(NodeState);
node_offsets.inputs = offset;
offset += sizeof(InputState) * node.inputs().size();
node_offsets.outputs = offset;
offset += sizeof(OutputState) * node.outputs().size();
}
r_preprocess_data.node_state_buffer_size = offset;
}
GraphExecutor::GraphExecutor(const Graph &graph,
const Span<const OutputSocket *> graph_inputs,
const Span<const InputSocket *> graph_outputs,
const PreprocessData &preprocess_data,
const Logger *logger,
const SideEffectProvider *side_effect_provider)
: graph_(graph),
preprocess_data_(preprocess_data),
graph_inputs_(graph_inputs),
graph_outputs_(graph_outputs),
logger_(logger),
@ -1360,15 +1364,17 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
executor.execute(params, context);
}
void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
void *GraphExecutor::init_storage(LocalAllocator &allocator) const
{
Executor &executor = *allocator.construct<Executor>(*this).release();
Executor &executor = allocator.allocate_new<Executor>(*this);
return &executor;
}
void GraphExecutor::destruct_storage(void *storage) const
void GraphExecutor::destruct_storage(void *storage, LocalAllocator &allocator) const
{
std::destroy_at(static_cast<Executor *>(storage));
Executor *executor = static_cast<Executor *>(storage);
executor->destruct_state(allocator);
allocator.destruct_free(executor);
}
void GraphExecutorLogger::log_socket_value(const Socket &socket,

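The changed init_storage/destruct_storage contract above means storage is now allocated and freed through the caller-provided LocalAllocator. A function with its own storage would follow the same pattern as LazyFunctionForGroupNode further down; roughly (MyFunction and MyStorage are made-up names):

class MyFunction : public LazyFunction {
 private:
  struct MyStorage {
    int calls = 0;
  };

 public:
  void *init_storage(LocalAllocator &allocator) const override
  {
    return &allocator.allocate_new<MyStorage>();
  }

  void destruct_storage(void *storage, LocalAllocator &allocator) const override
  {
    allocator.destruct_free(static_cast<MyStorage *>(storage));
  }

  void execute_impl(Params &params, const Context &context) const override
  {
    MyStorage &storage = *static_cast<MyStorage *>(context.storage);
    storage.calls++;
    UNUSED_VARS(params);
  }
};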
View File

@ -105,7 +105,11 @@ TEST(lazy_function, SideEffects)
SimpleSideEffectProvider side_effect_provider{{&store_node}};
GraphExecutor executor_fn{graph, {&input_node.output(0)}, {}, nullptr, &side_effect_provider};
GraphExecutor::PreprocessData preprocess_data;
GraphExecutor::preprocess(graph, preprocess_data);
GraphExecutor executor_fn{
graph, {&input_node.output(0)}, {}, preprocess_data, nullptr, &side_effect_provider};
execute_lazy_function_eagerly(executor_fn, nullptr, std::make_tuple(5), std::make_tuple());
EXPECT_EQ(dst1, 15);
@ -167,8 +171,11 @@ TEST(lazy_function, GraphWithCycle)
graph.update_node_indices();
GraphExecutor::PreprocessData preprocess_data;
GraphExecutor::preprocess(graph, preprocess_data);
GraphExecutor executor_fn{
graph, {&input_node.output(0)}, {&output_node.input(0)}, nullptr, nullptr};
graph, {&input_node.output(0)}, {&output_node.input(0)}, preprocess_data, nullptr, nullptr};
int result = 0;
execute_lazy_function_eagerly(
executor_fn, nullptr, std::make_tuple(10), std::make_tuple(&result));

View File

@ -1146,8 +1146,12 @@ static GeometrySet compute_geometry(
blender::nodes::GeometryNodesLazyFunctionLogger lf_logger(lf_graph_info);
blender::nodes::GeometryNodesLazyFunctionSideEffectProvider lf_side_effect_provider;
lf::GraphExecutor graph_executor{
lf_graph_info.graph, graph_inputs, graph_outputs, &lf_logger, &lf_side_effect_provider};
lf::GraphExecutor graph_executor{lf_graph_info.graph,
graph_inputs,
graph_outputs,
lf_graph_info.graph_preprocess_data,
&lf_logger,
&lf_side_effect_provider};
blender::nodes::GeoNodesModifierData geo_nodes_modifier_data;
geo_nodes_modifier_data.depsgraph = ctx->depsgraph;
@ -1169,7 +1173,9 @@ static GeometrySet compute_geometry(
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
user_data.compute_context = &modifier_compute_context;
blender::LinearAllocator<> allocator;
blender::LocalAllocatorSet allocator_set;
blender::LocalAllocator &allocator = allocator_set.local();
Vector<GMutablePointer> inputs_to_destruct;
int input_index = -1;
@ -1212,6 +1218,7 @@ static GeometrySet compute_geometry(
lf::Context lf_context;
lf_context.storage = graph_executor.init_storage(allocator);
lf_context.user_data = &user_data;
lf_context.allocator = &allocator;
lf::BasicParams lf_params{graph_executor,
param_inputs,
param_outputs,
@ -1219,7 +1226,7 @@ static GeometrySet compute_geometry(
param_output_usages,
param_set_outputs};
graph_executor.execute(lf_params, lf_context);
graph_executor.destruct_storage(lf_context.storage);
graph_executor.destruct_storage(lf_context.storage, allocator);
for (GMutablePointer &ptr : inputs_to_destruct) {
ptr.destruct();
@ -1289,6 +1296,7 @@ static void modifyGeometry(ModifierData *md,
const ModifierEvalContext *ctx,
GeometrySet &geometry_set)
{
SCOPED_TIMER_AVERAGED(__func__);
NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
if (nmd->node_group == nullptr) {
return;

View File

@ -187,6 +187,7 @@ struct GeometryNodesLazyFunctionGraphInfo {
* Mappings between the lazy-function graph and the #bNodeTree.
*/
GeometryNodeLazyFunctionGraphMapping mapping;
lf::GraphExecutor::PreprocessData graph_preprocess_data;
/**
* Approximate number of nodes in the graph if all sub-graphs were inlined.
* This can be used as a simple heuristic for the complexity of the node group.

View File

@ -769,6 +769,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_.emplace(lf_graph_info.graph,
std::move(graph_inputs),
std::move(graph_outputs),
lf_graph_info.graph_preprocess_data,
&*lf_logger_,
&*lf_side_effect_provider_);
}
@ -805,18 +806,18 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_->execute(params, group_context);
}
void *init_storage(LinearAllocator<> &allocator) const override
void *init_storage(LocalAllocator &allocator) const override
{
Storage *s = allocator.construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(allocator);
return s;
Storage &s = allocator.allocate_new<Storage>();
s.graph_executor_storage = graph_executor_->init_storage(allocator);
return &s;
}
void destruct_storage(void *storage) const override
void destruct_storage(void *storage, LocalAllocator &allocator) const override
{
Storage *s = static_cast<Storage *>(storage);
graph_executor_->destruct_storage(s->graph_executor_storage);
std::destroy_at(s);
graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
allocator.destruct_free(s);
}
std::string name() const override
@ -1243,6 +1244,7 @@ struct GeometryNodesLazyFunctionGraphBuilder {
lf_graph_->update_node_indices();
lf_graph_info_->num_inline_nodes_approximate += lf_graph_->nodes().size();
lf::GraphExecutor::preprocess(*lf_graph_, lf_graph_info_->graph_preprocess_data);
}
private: