WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

10 changed files with 185 additions and 25 deletions
Showing only changes of commit 3d3c4216b3


@@ -0,0 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <algorithm>
#include <array>
#include <cstddef>
#include <memory>
#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"
namespace blender {
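/**
 * A #LocalPool is always created for a specific scope and keeps a reference to it, so the scope
 * has to outlive all pools that are bound to it.
 */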
class LocalPoolScope {
};
template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
private:
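/** Alignment of the chunks that are requested from the underlying allocator. */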
static constexpr int64_t s_alignment = 64;
const LocalPoolScope &pool_scope_;
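/** All chunks that were allocated from the underlying allocator; they are freed in the destructor. */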
Vector<MutableSpan<std::byte>> owned_buffers_;
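/** A stack of equally sized buffers that are currently not in use and can be handed out again. */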
struct BufferStack {
int64_t element_size = -1;
Stack<void *, 0> stack;
};
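/** Stacks for the small size classes 8, 16, ..., 64 bytes; index i serves sizes up to 8 * (i + 1). */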
std::array<BufferStack, 8> small_stacks_;
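/** Lazily created stacks for larger buffers, one for each power-of-two size class. */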
std::unique_ptr<Map<int, BufferStack>> large_stacks_;
BLI_NO_UNIQUE_ADDRESS Allocator allocator_;
public:
LocalPool(const LocalPoolScope &pool_scope) : pool_scope_(pool_scope)
{
for (const int64_t i : IndexRange(small_stacks_.size())) {
small_stacks_[i].element_size = 8 * (i + 1);
}
}
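/** Gives all memory back to the underlying allocator. Buffers from this pool must not be used anymore afterwards. */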
~LocalPool()
{
for (MutableSpan<std::byte> buffer : owned_buffers_) {
BLI_asan_unpoison(buffer.data(), buffer.size());
allocator_.deallocate(buffer.data());
}
}
void *allocate(const int64_t size, const int64_t alignment)
{
BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
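/* Reuse a previously deallocated buffer of the same size class if one is available. */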
if (!buffer_stack.stack.is_empty()) {
void *buffer = buffer_stack.stack.pop();
BLI_asan_unpoison(buffer, size);
return buffer;
}
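/* For small sizes, allocate a larger chunk and split it into equally sized elements so that
 * upcoming allocations in the same size class can be served without going through the
 * underlying allocator again. */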
if (size <= 4096) {
const int64_t allocation_size = std::clamp<int64_t>(
buffer_stack.element_size * 16, 512, 4096);
void *buffer = allocator_.allocate(allocation_size, s_alignment, __func__);
BLI_asan_poison(buffer, allocation_size);
const int64_t num = allocation_size / buffer_stack.element_size;
for (int64_t i = num - 1; i > 0; i--) {
buffer_stack.stack.push(POINTER_OFFSET(buffer, buffer_stack.element_size * i));
}
owned_buffers_.append({static_cast<std::byte *>(buffer), allocation_size});
BLI_asan_unpoison(buffer, size);
return buffer;
}
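/* Larger allocations are forwarded to the underlying allocator individually. */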
void *buffer = allocator_.allocate(
size_t(size), std::max<size_t>(s_alignment, size_t(alignment)), __func__);
owned_buffers_.append({static_cast<std::byte *>(buffer), size});
return buffer;
}
void deallocate(const void *buffer, const int64_t size, const int64_t alignment)
{
BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
#ifdef DEBUG
memset(const_cast<void *>(buffer), -1, size);
#endif
BLI_asan_poison(buffer, size);
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
buffer_stack.stack.push(const_cast<void *>(buffer));
}
template<typename T, typename... Args> destruct_ptr<T> construct(Args &&...args)
{
void *buffer = this->allocate(sizeof(T), alignof(T));
T *value = new (buffer) T(std::forward<Args>(args)...);
return destruct_ptr<T>(value);
}
template<typename T> MutableSpan<T> allocate_array(int64_t size)
{
T *array = static_cast<T *>(this->allocate(sizeof(T) * size, alignof(T)));
return MutableSpan<T>(array, size);
}
template<typename T, typename... Args>
MutableSpan<T> construct_array(int64_t size, Args &&...args)
{
MutableSpan<T> array = this->allocate_array<T>(size);
for (const int64_t i : IndexRange(size)) {
new (&array[i]) T(std::forward<Args>(args)...);
}
return array;
}
private:
BufferStack &get_buffer_stack(const int64_t size, const int64_t /*alignment*/)
{
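/* Sizes of up to 64 bytes map to one of the eight small stacks; size 0 uses the first one. */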
if (size <= 64) {
return small_stacks_[(size - (size != 0)) >> 3];
}
if (!large_stacks_) {
large_stacks_ = std::make_unique<Map<int, BufferStack>>();
}
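/* Larger sizes get one stack per power-of-two size class; the key is derived from the highest
 * set bit of the requested size. */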
const int key = bitscan_reverse_uint64(uint64_t(size));
return large_stacks_->lookup_or_add_cb(key, [&]() {
BufferStack buffer_stack;
buffer_stack.element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
return buffer_stack;
});
}
};
} // namespace blender
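
For orientation (not part of the diff): a minimal usage sketch of the API above, following the same pattern as the new test and the compute_geometry() change further down. The function name is illustrative.

#include "BLI_local_pool.hh"

namespace blender {

static void local_pool_usage_example()
{
  /* A pool is always bound to a scope, and the scope has to outlive the pool. */
  LocalPoolScope scope;
  LocalPool<> pool(scope);

  /* Raw allocation; the alignment has to be <= 64 and <= the size. */
  void *buffer = pool.allocate(256, 16);
  pool.deallocate(buffer, 256, 16);

  /* Typed helpers. The #destruct_ptr only runs the destructor; the memory itself stays owned by
   * the pool and is given back to the underlying allocator when the pool is destructed. */
  destruct_ptr<Vector<int>> numbers = pool.construct<Vector<int>>();
  numbers->append(42);

  MutableSpan<int> values = pool.construct_array<int>(100, 0);
  values[0] = 1;
}

}  // namespace blender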


@@ -255,6 +255,7 @@ set(SRC
BLI_linklist_stack.h
BLI_listbase.h
BLI_listbase_wrapper.hh
BLI_local_pool.hh
BLI_map.hh
BLI_map_slots.hh
BLI_math.h
@@ -479,6 +480,7 @@ if(WITH_GTESTS)
tests/BLI_linear_allocator_test.cc
tests/BLI_linklist_lockfree_test.cc
tests/BLI_listbase_test.cc
tests/BLI_local_pool_test.cc
tests/BLI_map_test.cc
tests/BLI_math_base_safe_test.cc
tests/BLI_math_base_test.cc


@@ -0,0 +1,18 @@
/* SPDX-License-Identifier: Apache-2.0 */
#include "BLI_local_pool.hh"
#include "BLI_strict_flags.h"
#include "testing/testing.h"
namespace blender::tests {
TEST(local_pool, Test)
{
LocalPoolScope pool_scope;
LocalPool pool(pool_scope);
std::cout << pool.allocate(30000, 8) << "\n";
}
} // namespace blender::tests


@@ -42,6 +42,7 @@
#include "BLI_function_ref.hh"
#include "BLI_generic_pointer.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_local_pool.hh"
#include "BLI_vector.hh"
#include <atomic>
@@ -98,6 +99,8 @@ struct Context {
* Custom user data that can be used in the function.
*/
UserData *user_data;
LocalPool<> *local_pool = nullptr;
};
/**
@@ -276,7 +279,7 @@ class LazyFunction {
* Allocates storage for this function. The storage will be passed to every call to #execute.
* If the function does not keep track of any state, this does not have to be implemented.
*/
virtual void *init_storage(LinearAllocator<> &allocator) const;
virtual void *init_storage(LocalPool<> &allocator) const;
/**
* Destruct the storage created in #init_storage.
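
Since init_storage() now receives a LocalPool<> instead of a LinearAllocator<>, implementations allocate their storage from the pool and only run the destructor in destruct_storage(), as the GraphExecutor and LazyFunctionForGroupNode changes below do. A hypothetical minimal implementation (class and member names are illustrative, not part of the diff):

#include "FN_lazy_function.hh"

namespace blender::fn::lazy_function {

class ExampleFunction : public LazyFunction {
 private:
  struct Storage {
    int evaluation_count = 0;
  };

 public:
  void *init_storage(LocalPool<> &allocator) const override
  {
    /* The pool owns the memory and reclaims it when it is destructed. */
    return allocator.construct<Storage>().release();
  }

  void destruct_storage(void *storage) const override
  {
    /* Only the destructor has to be run; the memory stays with the pool. */
    static_cast<Storage *>(storage)->~Storage();
  }

  void execute_impl(Params & /*params*/, const Context &context) const override
  {
    Storage &storage = *static_cast<Storage *>(context.storage);
    storage.evaluation_count++;
  }
};

}  // namespace blender::fn::lazy_function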


@@ -85,7 +85,8 @@ inline void execute_lazy_function_eagerly_impl(
...);
output_usages.fill(ValueUsage::Used);
set_outputs.fill(false);
LinearAllocator<> allocator;
LocalPoolScope local_pool_scope;
LocalPool<> allocator(local_pool_scope);
Context context;
context.user_data = user_data;
context.storage = fn.init_storage(allocator);


@@ -88,7 +88,7 @@ class GraphExecutor : public LazyFunction {
const Logger *logger,
const SideEffectProvider *side_effect_provider);
void *init_storage(LinearAllocator<> &allocator) const override;
void *init_storage(LocalPool<> &allocator) const override;
void destruct_storage(void *storage) const override;
private:


@@ -25,7 +25,7 @@ std::string LazyFunction::output_name(int index) const
return outputs_[index].debug_name;
}
void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
void *LazyFunction::init_storage(LocalPool<> & /*allocator*/) const
{
return nullptr;
}


@@ -241,15 +241,16 @@ class Executor {
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
std::thread::id current_main_thread_;
#endif
/**
* A separate allocation pool for every thread, so that threads can allocate and reuse memory
* without synchronizing with each other.
*/
LocalPoolScope local_pool_scope_;
struct ThreadLocalData {
LinearAllocator<> allocator;
LocalPool<> local_pool;
ThreadLocalData(const LocalPoolScope &local_pool_scope) : local_pool(local_pool_scope)
{
}
};
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
LinearAllocator<> main_allocator_;
LocalPool<> main_allocator_;
/**
* Set to false when the first execution ends.
*/
@@ -258,7 +259,8 @@ class Executor {
friend GraphExecutorLFParams;
public:
Executor(const GraphExecutor &self) : self_(self), loaded_inputs_(self.graph_inputs_.size())
Executor(const GraphExecutor &self)
: self_(self), loaded_inputs_(self.graph_inputs_.size()), main_allocator_(local_pool_scope_)
{
/* The indices are necessary, because they are used as keys in #node_states_. */
BLI_assert(self_.graph_.node_indices_are_valid());
@@ -340,7 +342,7 @@ class Executor {
Span<const Node *> nodes = self_.graph_.nodes();
node_states_.reinitialize(nodes.size());
auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
auto construct_node_range = [&](const IndexRange range, LocalPool<> &allocator) {
for (const int i : range) {
const Node &node = *nodes[i];
NodeState &node_state = *allocator.construct<NodeState>().release();
@@ -355,13 +357,13 @@ class Executor {
this->ensure_thread_locals();
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
construct_node_range(range, allocator);
});
}
}
void construct_initial_node_state(LinearAllocator<> &allocator,
void construct_initial_node_state(LocalPool<> &allocator,
const Node &node,
NodeState &node_state)
{
@@ -533,7 +535,7 @@ class Executor {
void forward_newly_provided_inputs(CurrentTask &current_task)
{
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
if (was_loaded.load()) {
@@ -552,7 +554,7 @@ class Executor {
}
void forward_newly_provided_input(CurrentTask &current_task,
LinearAllocator<> &allocator,
LocalPool<> &allocator,
const int graph_input_index,
void *input_data)
{
@@ -706,7 +708,7 @@ class Executor {
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
{
NodeState &node_state = *node_states_[node.index_in_graph()];
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
const LazyFunction &fn = node.function();
bool node_needs_execution = false;
@@ -965,7 +967,7 @@ class Executor {
CurrentTask &current_task)
{
BLI_assert(value_to_forward.get() != nullptr);
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
const CPPType &type = *value_to_forward.type();
if (self_.logger_ != nullptr) {
@@ -1091,7 +1093,8 @@ class Executor {
}
#endif
if (!thread_locals_) {
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>(
[scope = &local_pool_scope_]() { return ThreadLocalData{*scope}; });
}
}
@@ -1130,10 +1133,10 @@ class Executor {
});
}
LinearAllocator<> &get_main_or_local_allocator()
LocalPool<> &get_main_or_local_allocator()
{
if (this->use_multi_threading()) {
return thread_locals_->local().allocator;
return thread_locals_->local().local_pool;
}
return main_allocator_;
}
@@ -1184,7 +1187,7 @@ class GraphExecutorLFParams final : public Params {
OutputState &output_state = node_state_.outputs[index];
BLI_assert(!output_state.has_been_computed);
if (output_state.value == nullptr) {
LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
LocalPool<> &allocator = executor_.get_main_or_local_allocator();
const CPPType &type = node_.output(index).type();
output_state.value = allocator.allocate(type.size(), type.alignment());
}
@@ -1296,7 +1299,7 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
executor.execute(params, context);
}
void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
void *GraphExecutor::init_storage(LocalPool<> &allocator) const
{
Executor &executor = *allocator.construct<Executor>(*this).release();
return &executor;


@@ -1163,7 +1163,8 @@ static GeometrySet compute_geometry(
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
user_data.compute_context = &modifier_compute_context;
blender::LinearAllocator<> allocator;
blender::LocalPoolScope local_pool_scope;
blender::LocalPool<> allocator(local_pool_scope);
Vector<GMutablePointer> inputs_to_destruct;
int input_index;


@@ -689,7 +689,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_->execute(params, group_context);
}
void *init_storage(LinearAllocator<> &allocator) const override
void *init_storage(LocalPool<> &allocator) const override
{
Storage *s = allocator.construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(allocator);