WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

10 changed files with 85 additions and 246 deletions
Showing only changes of commit e4491302fc

View File

@@ -35,9 +35,10 @@ class LocalAllocator : NonCopyable, NonMovable {
friend LocalAllocatorSet;
LocalAllocator(LocalAllocatorSet &owner_set);
~LocalAllocator();
public:
~LocalAllocator();
bool is_local() const;
LocalAllocator &local();
@@ -45,8 +46,10 @@ class LocalAllocator : NonCopyable, NonMovable {
void deallocate(const void *buffer, const int64_t size, const int64_t alignment);
template<typename T, typename... Args> T &allocate_new(Args &&...args);
template<typename T, typename... Args> void destruct_free(const T &value);
template<typename T> MutableSpan<T> allocate_new_array(const int64_t size);
template<typename T, typename... Args> void destruct_free(const T *value);
template<typename T> MutableSpan<T> allocate_array(const int64_t size);
template<typename T, typename... Args>
MutableSpan<T> allocate_new_array(const int64_t size, Args &&...args);
template<typename T> void destruct_free_array(Span<T> data);
template<typename T> void destruct_free_array(MutableSpan<T> data);
@@ -142,13 +145,13 @@ template<typename T, typename... Args> inline T &LocalAllocator::allocate_new(Ar
return *value;
}
template<typename T, typename... Args> inline void LocalAllocator::destruct_free(const T &value)
template<typename T, typename... Args> inline void LocalAllocator::destruct_free(const T *value)
{
std::destroy_at(value);
this->deallocate(&value, sizeof(T), alignof(T));
this->deallocate(value, sizeof(T), alignof(T));
}
template<typename T> MutableSpan<T> inline LocalAllocator::allocate_new_array(const int64_t size)
template<typename T> MutableSpan<T> inline LocalAllocator::allocate_array(const int64_t size)
{
if (size == 0) {
return {};
@@ -157,6 +160,16 @@ template<typename T> MutableSpan<T> inline LocalAllocator::allocate_new_array(co
return {static_cast<T *>(buffer), size};
}
template<typename T, typename... Args>
MutableSpan<T> inline LocalAllocator::allocate_new_array(const int64_t size, Args &&...args)
{
MutableSpan<T> array = this->allocate_array<T>(size);
for (const int64_t i : IndexRange(size)) {
new (&array[i]) T(std::forward<Args>(args)...);
}
return array;
}
template<typename T> inline void LocalAllocator::destruct_free_array(Span<T> data)
{
if (data.is_empty()) {

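For context, a minimal usage sketch of the allocator API after this change, based only on the declarations visible in the hunk above; MyValue, the wrapper function, and the element count are placeholders, not part of the patch:

#include "BLI_local_allocator.hh"

using namespace blender;

struct MyValue {
  int number = 0;
};

static void allocator_usage_sketch(LocalAllocatorSet &allocator_set)
{
  /* Each caller works with the allocator for its own thread. */
  LocalAllocator &allocator = allocator_set.local();

  /* Single value: destruct_free now takes a pointer instead of a reference. */
  MyValue &value = allocator.allocate_new<MyValue>();
  allocator.destruct_free(&value);

  /* allocate_array hands out uninitialized memory, while allocate_new_array also
   * constructs every element (optionally forwarding constructor arguments). */
  MutableSpan<MyValue> values = allocator.allocate_new_array<MyValue>(16);
  allocator.destruct_free_array(values);
}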
View File

@@ -1,166 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <cstddef>
#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"
namespace blender {
template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
private:
static constexpr int64_t s_alignment = 64;
LinearAllocator<> linear_allocator_;
struct BufferStack {
int64_t element_size = -1;
int64_t min_alignment = -1;
Stack<void *, 0> stack;
};
std::array<BufferStack, 8> small_stacks_;
std::unique_ptr<Map<int, BufferStack>> large_stacks_;
public:
LocalPool()
{
for (const int64_t i : IndexRange(small_stacks_.size())) {
BufferStack &buffer_stack = small_stacks_[i];
buffer_stack.element_size = 8 * (i + 1);
buffer_stack.min_alignment = power_of_2_min_u(buffer_stack.element_size);
}
}
~LocalPool()
{
}
void *allocate(const int64_t size, const int64_t alignment)
{
BLI_assert(size > 0);
BLI_assert(alignment <= size && alignment <= s_alignment);
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
BLI_assert(buffer_stack.element_size >= size);
BLI_assert(buffer_stack.min_alignment >= alignment);
void *buffer;
if (!buffer_stack.stack.is_empty()) {
buffer = buffer_stack.stack.pop();
BLI_asan_unpoison(buffer, size);
}
else {
buffer = linear_allocator_.allocate(buffer_stack.element_size, buffer_stack.min_alignment);
}
return buffer;
}
void deallocate(const void *buffer, const int64_t size, const int64_t alignment)
{
BLI_assert(size > 0);
BLI_assert(alignment <= size && alignment <= s_alignment);
#ifdef DEBUG
memset(const_cast<void *>(buffer), -1, size);
#endif
BLI_asan_poison(buffer, size);
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
BLI_assert(buffer_stack.element_size >= size);
buffer_stack.stack.push(const_cast<void *>(buffer));
}
template<typename T, typename... Args> destruct_ptr<T> construct(Args &&...args)
{
void *buffer = this->allocate(sizeof(T), alignof(T));
T *value = new (buffer) T(std::forward<Args>(args)...);
return destruct_ptr<T>(value);
}
template<typename T> MutableSpan<T> allocate_array(int64_t size)
{
if (size == 0) {
return {};
}
T *array = static_cast<T *>(this->allocate(sizeof(T) * size, alignof(T)));
return MutableSpan<T>(array, size);
}
template<typename T, typename... Args>
MutableSpan<T> construct_array(int64_t size, Args &&...args)
{
MutableSpan<T> array = this->allocate_array<T>(size);
for (const int64_t i : IndexRange(size)) {
new (&array[i]) T(std::forward<Args>(args)...);
}
return array;
}
template<typename T> void destruct_array(Span<T> data)
{
if (data.is_empty()) {
return;
}
destruct_n(const_cast<T *>(data.data()), data.size());
this->deallocate(data.data(), data.size() * sizeof(T), alignof(T));
}
template<typename T> void destruct_array(MutableSpan<T> data)
{
this->destruct_array(data.as_span());
}
template<typename T> void destruct(const T *value)
{
std::destroy_at(value);
this->deallocate(value, sizeof(T), alignof(T));
}
private:
BufferStack &get_buffer_stack(const int64_t size, const int64_t /*alignment*/)
{
if (size <= 64) {
return small_stacks_[(size - 1) >> 3];
}
if (!large_stacks_) {
large_stacks_ = std::make_unique<Map<int, BufferStack>>();
}
const int key = bitscan_reverse_uint64(uint64_t(size));
return large_stacks_->lookup_or_add_cb(key, [&]() {
BufferStack buffer_stack;
buffer_stack.element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
buffer_stack.min_alignment = s_alignment;
return buffer_stack;
});
}
};
class LocalMemoryPools {
private:
threading::EnumerableThreadSpecific<LocalPool<>> pool_by_thread_;
public:
~LocalMemoryPools()
{
}
LocalPool<> &local()
{
return pool_by_thread_.local();
}
};
struct Pools {
LocalMemoryPools *pools = nullptr;
LocalPool<> *local = nullptr;
};
} // namespace blender

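A note on the size-class logic of the removed BLI_local_pool.hh, as a reference point while reviewing its replacement: allocations of up to 64 bytes map to one of eight small buckets via (size - 1) >> 3, and larger sizes fall back to lazily created buckets that each cover one power-of-two range, keyed by the position of the highest set bit. A standalone sketch of the small-bucket mapping, independent of the BLI headers:

#include <cassert>
#include <cstdint>

/* Mirrors small_stacks_[(size - 1) >> 3] from the removed LocalPool. */
static std::int64_t small_bucket_index(const std::int64_t size)
{
  assert(size >= 1 && size <= 64);
  return (size - 1) >> 3;
}

int main()
{
  assert(small_bucket_index(1) == 0);  /* 1..8 bytes share the 8-byte bucket. */
  assert(small_bucket_index(8) == 0);
  assert(small_bucket_index(9) == 1);  /* 9..16 bytes use the 16-byte bucket. */
  assert(small_bucket_index(64) == 7); /* 57..64 bytes use the 64-byte bucket. */
  return 0;
}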
View File

@@ -90,6 +90,7 @@ set(SRC
intern/lazy_threading.cc
intern/length_parameterize.cc
intern/listbase.cc
intern/local_allocator.cc
intern/math_base.c
intern/math_base_inline.c
intern/math_base_safe_inline.c
@@ -256,7 +257,6 @@ set(SRC
BLI_listbase.h
BLI_listbase_wrapper.hh
BLI_local_allocator.hh
BLI_local_pool.hh
BLI_map.hh
BLI_map_slots.hh
BLI_math.h

View File

@@ -42,7 +42,7 @@
#include "BLI_function_ref.hh"
#include "BLI_generic_pointer.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_local_pool.hh"
#include "BLI_local_allocator.hh"
#include "BLI_vector.hh"
#include <atomic>
@@ -100,7 +100,7 @@ struct Context {
*/
UserData *user_data;
Pools pools;
LocalAllocator *allocator;
};
/**
@@ -279,12 +279,12 @@ class LazyFunction {
* Allocates storage for this function. The storage will be passed to every call to #execute.
* If the function does not keep track of any state, this does not have to be implemented.
*/
virtual void *init_storage(Pools &pools) const;
virtual void *init_storage(LocalAllocator &allocator) const;
/**
* Destruct the storage created in #init_storage.
*/
virtual void destruct_storage(void *storage, Pools &pools) const;
virtual void destruct_storage(void *storage, LocalAllocator &allocator) const;
/**
* Calls `fn` with the input indices that the given `output_index` may depend on. By default

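For illustration, a sketch of how a LazyFunction implements the reworked storage virtuals, following the same pattern that GraphExecutor and LazyFunctionForGroupNode use further down; MyFunction and MyStorage are placeholders, not part of the patch:

class MyFunction : public LazyFunction {
 private:
  struct MyStorage {
    int counter = 0;
  };

 public:
  void *init_storage(LocalAllocator &allocator) const override
  {
    return &allocator.allocate_new<MyStorage>();
  }

  void destruct_storage(void *storage, LocalAllocator &allocator) const override
  {
    allocator.destruct_free(static_cast<MyStorage *>(storage));
  }

  void execute_impl(Params & /*params*/, const Context &context) const override
  {
    MyStorage &storage = *static_cast<MyStorage *>(context.storage);
    storage.counter++;
    /* Scratch allocations during execution would go through context.allocator,
     * which replaces the previous context.pools. */
  }
};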
View File

@@ -85,16 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
...);
output_usages.fill(ValueUsage::Used);
set_outputs.fill(false);
LocalMemoryPools local_pools;
Pools pools{&local_pools, &local_pools.local()};
LocalAllocatorSet allocator_set;
LocalAllocator &allocator = allocator_set.local();
Context context;
context.user_data = user_data;
context.storage = fn.init_storage(pools);
context.pools = pools;
context.storage = fn.init_storage(allocator);
context.allocator = &allocator;
BasicParams params{
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
fn.execute(params, context);
fn.destruct_storage(context.storage, pools);
fn.destruct_storage(context.storage, allocator);
/* Make sure all outputs have been computed. */
BLI_assert(!Span<bool>(set_outputs).contains(false));

View File

@@ -88,8 +88,8 @@ class GraphExecutor : public LazyFunction {
const Logger *logger,
const SideEffectProvider *side_effect_provider);
void *init_storage(Pools &pools) const override;
void destruct_storage(void *storage, Pools &pools) const override;
void *init_storage(LocalAllocator &allocator) const override;
void destruct_storage(void *storage, LocalAllocator &allocator) const override;
private:
void execute_impl(Params &params, const Context &context) const override;

View File

@@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
return outputs_[index].debug_name;
}
void *LazyFunction::init_storage(Pools & /*pools*/) const
void *LazyFunction::init_storage(LocalAllocator & /*allocator*/) const
{
return nullptr;
}
void LazyFunction::destruct_storage(void *storage, Pools & /*pools*/) const
void LazyFunction::destruct_storage(void *storage, LocalAllocator & /*allocator*/) const
{
BLI_assert(storage == nullptr);
UNUSED_VARS_NDEBUG(storage);

View File

@@ -262,22 +262,20 @@ class Executor {
BLI_assert(self_.graph_.node_indices_are_valid());
}
void destruct_state(Pools &pools)
void destruct_state(LocalAllocator &allocator)
{
if (TaskPool *task_pool = task_pool_.load()) {
BLI_task_pool_free(task_pool);
}
threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) {
LocalPool<> &local = (range.size() == node_states_.size()) ? *pools.local :
pools.pools->local();
LocalAllocator &local_allocator = allocator.local();
for (const int node_index : range) {
const Node &node = *self_.graph_.nodes()[node_index];
NodeState &node_state = node_states_[node_index];
Pools sub_pools = {pools.pools, &local};
this->destruct_node_state(node, node_state, sub_pools);
this->destruct_node_state(node, node_state, local_allocator);
}
});
pools.local->destruct_array(node_states_);
allocator.destruct_free_array(node_states_);
}
/**
@@ -340,45 +338,44 @@ class Executor {
void initialize_node_states()
{
Span<const Node *> nodes = self_.graph_.nodes();
node_states_ = context_->pools.local->construct_array<NodeState>(nodes.size());
node_states_ = context_->allocator->allocate_new_array<NodeState>(nodes.size());
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LocalPool<> &allocator = (range.size() == nodes.size()) ? *context_->pools.local :
this->get_local_allocator();
LocalAllocator &local_allocator = context_->allocator->local();
for (const int i : range) {
const Node &node = *nodes[i];
this->construct_initial_node_state(allocator, node, node_states_[i]);
this->construct_initial_node_state(local_allocator, node, node_states_[i]);
}
});
}
void construct_initial_node_state(LocalPool<> &allocator,
void construct_initial_node_state(LocalAllocator &allocator,
const Node &node,
NodeState &node_state)
{
const Span<const InputSocket *> node_inputs = node.inputs();
const Span<const OutputSocket *> node_outputs = node.outputs();
node_state.inputs = allocator.construct_array<InputState>(node_inputs.size());
node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
node_state.inputs = allocator.allocate_new_array<InputState>(node_inputs.size());
node_state.outputs = allocator.allocate_new_array<OutputState>(node_outputs.size());
}
void destruct_node_state(const Node &node, NodeState &node_state, Pools &pools)
void destruct_node_state(const Node &node, NodeState &node_state, LocalAllocator &allocator)
{
if (node.is_function()) {
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
if (node_state.storage != nullptr) {
fn.destruct_storage(node_state.storage, pools);
fn.destruct_storage(node_state.storage, allocator);
}
}
for (const int i : node.inputs().index_range()) {
InputState &input_state = node_state.inputs[i];
const InputSocket &input_socket = node.input(i);
this->destruct_input_value_if_exists(input_state, input_socket.type(), *pools.local);
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
}
pools.local->destruct_array(node_state.inputs);
pools.local->destruct_array(node_state.outputs);
allocator.destruct_free_array(node_state.inputs);
allocator.destruct_free_array(node_state.outputs);
}
void schedule_newly_requested_outputs(CurrentTask &current_task)
@@ -441,14 +438,14 @@ class Executor {
* `OutputState.potential_target_sockets`.
*/
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes,
LocalPool<> &allocator)
LocalAllocator &allocator)
{
const Span<const Node *> all_nodes = self_.graph_.nodes();
/* Used for a search through all nodes that outputs depend on. */
Stack<const Node *, 100> reachable_nodes_to_check;
MutableSpan<bool> reachable_node_flags = allocator.allocate_array<bool>(all_nodes.size());
BLI_SCOPED_DEFER([&]() { allocator.destruct_array(reachable_node_flags); });
MutableSpan<bool> reachable_node_flags = allocator.allocate_new_array<bool>(all_nodes.size());
BLI_SCOPED_DEFER([&]() { allocator.destruct_free_array(reachable_node_flags); });
reachable_node_flags.fill(false);
/* Graph outputs are always reachable. */
@@ -529,7 +526,7 @@ class Executor {
void forward_newly_provided_inputs(CurrentTask &current_task)
{
LocalPool<> &allocator = this->get_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
if (was_loaded.load()) {
@@ -548,7 +545,7 @@ class Executor {
}
void forward_newly_provided_input(CurrentTask &current_task,
LocalPool<> &allocator,
LocalAllocator &allocator,
const int graph_input_index,
void *input_data)
{
@@ -702,7 +699,7 @@ class Executor {
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
{
NodeState &node_state = node_states_[node.index_in_graph()];
LocalPool<> &allocator = this->get_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const LazyFunction &fn = node.function();
bool node_needs_execution = false;
@@ -768,8 +765,7 @@ class Executor {
if (node_needs_execution) {
if (!node_state.storage_and_defaults_initialized) {
/* Initialize storage. */
Pools pools{context_->pools.pools, &allocator};
node_state.storage = fn.init_storage(pools);
node_state.storage = fn.init_storage(allocator);
/* Load unlinked inputs. */
for (const int input_index : node.inputs().index_range()) {
@@ -843,7 +839,7 @@ class Executor {
}
}
void finish_node_if_possible(LockedNode &locked_node, LocalPool<> &allocator)
void finish_node_if_possible(LockedNode &locked_node, LocalAllocator &allocator)
{
const Node &node = locked_node.node;
NodeState &node_state = locked_node.node_state;
@@ -881,8 +877,7 @@ class Executor {
if (node_state.storage != nullptr) {
if (node.is_function()) {
const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
Pools pools{context_->pools.pools, &allocator};
fn_node.function().destruct_storage(node_state.storage, pools);
fn_node.function().destruct_storage(node_state.storage, allocator);
}
node_state.storage = nullptr;
}
@@ -890,7 +885,7 @@ class Executor {
void destruct_input_value_if_exists(InputState &input_state,
const CPPType &type,
LocalPool<> &allocator)
LocalAllocator &allocator)
{
if (input_state.value != nullptr) {
type.destruct(input_state.value);
@@ -902,14 +897,14 @@ class Executor {
void execute_node(const FunctionNode &node,
NodeState &node_state,
CurrentTask &current_task,
LocalPool<> &allocator);
LocalAllocator &allocator);
void set_input_unused_during_execution(const Node &node,
NodeState &node_state,
const int input_index,
CurrentTask &current_task)
{
LocalPool<> &allocator = this->get_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const InputSocket &input_socket = node.input(input_index);
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
this->set_input_unused(locked_node, input_socket, allocator);
@@ -918,7 +913,7 @@ class Executor {
void set_input_unused(LockedNode &locked_node,
const InputSocket &input_socket,
LocalPool<> &allocator)
LocalAllocator &allocator)
{
NodeState &node_state = locked_node.node_state;
const int input_index = input_socket.index();
@@ -984,7 +979,7 @@ class Executor {
CurrentTask &current_task)
{
BLI_assert(value_to_forward.get() != nullptr);
LocalPool<> &allocator = this->get_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const CPPType &type = *value_to_forward.type();
if (self_.logger_ != nullptr) {
@@ -1143,12 +1138,12 @@ class Executor {
});
}
LocalPool<> &get_local_allocator()
LocalAllocator &get_local_allocator()
{
if (this->use_multi_threading()) {
return context_->pools.pools->local();
return context_->allocator->local();
}
return *context_->pools.local;
return *context_->allocator;
}
};
@@ -1197,7 +1192,7 @@ class GraphExecutorLFParams final : public Params {
OutputState &output_state = node_state_.outputs[index];
BLI_assert(!output_state.has_been_computed);
if (output_state.value == nullptr) {
LocalPool<> &allocator = executor_.get_local_allocator();
LocalAllocator &allocator = executor_.get_local_allocator();
const CPPType &type = node_.output(index).type();
output_state.value = allocator.allocate(type.size(), type.alignment());
}
@@ -1247,14 +1242,14 @@ class GraphExecutorLFParams final : public Params {
inline void Executor::execute_node(const FunctionNode &node,
NodeState &node_state,
CurrentTask &current_task,
LocalPool<> &allocator)
LocalAllocator &allocator)
{
const LazyFunction &fn = node.function();
GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task};
BLI_assert(context_ != nullptr);
Context fn_context = *context_;
fn_context.storage = node_state.storage;
fn_context.pools.local = &allocator;
fn_context.allocator = &allocator;
if (self_.logger_ != nullptr) {
self_.logger_->log_before_node_execute(node, node_params, fn_context);
@@ -1311,17 +1306,17 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
executor.execute(params, context);
}
void *GraphExecutor::init_storage(Pools &pools) const
void *GraphExecutor::init_storage(LocalAllocator &allocator) const
{
Executor &executor = *pools.local->construct<Executor>(*this).release();
Executor &executor = allocator.allocate_new<Executor>(*this);
return &executor;
}
void GraphExecutor::destruct_storage(void *storage, Pools &pools) const
void GraphExecutor::destruct_storage(void *storage, LocalAllocator &allocator) const
{
Executor *executor = static_cast<Executor *>(storage);
executor->destruct_state(pools);
pools.local->destruct(executor);
executor->destruct_state(allocator);
allocator.destruct_free(executor);
}
void GraphExecutorLogger::log_socket_value(const Socket &socket,

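One design point visible in the hunks above: get_local_allocator only goes through LocalAllocator::local() once multi-threading is in use, and wherever the executor fans work out (destruct_state, initialize_node_states) it re-resolves the thread-local allocator inside the parallel loop. A sketch of that pattern outside the executor; the helper, its arguments, and the scratch buffer are made up for illustration:

#include "BLI_local_allocator.hh"
#include "BLI_task.hh"

using namespace blender;

static void process_in_parallel(LocalAllocator &allocator, const Span<int> items)
{
  threading::parallel_for(items.index_range(), 256, [&](const IndexRange range) {
    /* Resolve the allocator that belongs to the worker thread instead of
     * using the caller's allocator directly. */
    LocalAllocator &local_allocator = allocator.local();
    MutableSpan<int> scratch = local_allocator.allocate_new_array<int>(range.size());
    /* ... fill and use scratch for this range ... */
    local_allocator.destruct_free_array(scratch);
  });
}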
View File

@@ -1162,11 +1162,8 @@ static GeometrySet compute_geometry(
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
user_data.compute_context = &modifier_compute_context;
blender::LocalMemoryPools local_pools;
blender::Pools pools;
pools.pools = &local_pools;
pools.local = &local_pools.local();
blender::LocalPool<> &allocator = *pools.local;
blender::LocalAllocatorSet allocator_set;
blender::LocalAllocator &allocator = allocator_set.local();
Vector<GMutablePointer> inputs_to_destruct;
@@ -1208,9 +1205,9 @@ static GeometrySet compute_geometry(
}
lf::Context lf_context;
lf_context.storage = graph_executor.init_storage(pools);
lf_context.storage = graph_executor.init_storage(allocator);
lf_context.user_data = &user_data;
lf_context.pools = pools;
lf_context.allocator = &allocator;
lf::BasicParams lf_params{graph_executor,
param_inputs,
param_outputs,
@@ -1218,7 +1215,7 @@ static GeometrySet compute_geometry(
param_output_usages,
param_set_outputs};
graph_executor.execute(lf_params, lf_context);
graph_executor.destruct_storage(lf_context.storage, pools);
graph_executor.destruct_storage(lf_context.storage, allocator);
for (GMutablePointer &ptr : inputs_to_destruct) {
ptr.destruct();

View File

@@ -806,18 +806,18 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_->execute(params, group_context);
}
void *init_storage(Pools &pools) const override
void *init_storage(LocalAllocator &allocator) const override
{
Storage *s = pools.local->construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(pools);
return s;
Storage &s = allocator.allocate_new<Storage>();
s.graph_executor_storage = graph_executor_->init_storage(allocator);
return &s;
}
void destruct_storage(void *storage, Pools &pools) const override
void destruct_storage(void *storage, LocalAllocator &allocator) const override
{
Storage *s = static_cast<Storage *>(storage);
graph_executor_->destruct_storage(s->graph_executor_storage, pools);
pools.local->destruct(s);
graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
allocator.destruct_free(s);
}
std::string name() const override