WIP: Functions: new local allocator for better memory reuse and performance #104630
@@ -0,0 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>
#include <memory>

#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_memory_utils.hh"
#include "BLI_stack.hh"
#include "BLI_utildefines.h"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"

namespace blender {

class LocalPoolScope {
};

template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
 private:
  static constexpr int64_t s_alignment = 64;

  const LocalPoolScope &pool_scope_;
  Vector<MutableSpan<std::byte>> owned_buffers_;

  /* A free-list of equally sized buffers. */
  struct BufferStack {
    int64_t element_size = -1;
    Stack<void *, 0> stack;
  };

  /* Free-lists for the small size classes 8, 16, ..., 64 bytes. */
  std::array<BufferStack, 8> small_stacks_;
  /* Lazily created free-lists for larger, power-of-two size classes. */
  std::unique_ptr<Map<int, BufferStack>> large_stacks_;

  BLI_NO_UNIQUE_ADDRESS Allocator allocator_;

 public:
  LocalPool(const LocalPoolScope &pool_scope) : pool_scope_(pool_scope)
  {
    for (const int64_t i : IndexRange(small_stacks_.size())) {
      small_stacks_[i].element_size = 8 * (i + 1);
    }
  }

  ~LocalPool()
  {
    for (MutableSpan<std::byte> buffer : owned_buffers_) {
      BLI_asan_unpoison(buffer.data(), buffer.size());
      allocator_.deallocate(buffer.data());
    }
  }

  void *allocate(const int64_t size, const int64_t alignment)
  {
    BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
    BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
    if (!buffer_stack.stack.is_empty()) {
      /* Reuse a previously freed buffer of the same size class. */
      void *buffer = buffer_stack.stack.pop();
      BLI_asan_unpoison(buffer, size);
      return buffer;
    }
    if (size <= 4096) {
      /* Allocate a chunk that holds multiple elements and push all but the
       * first onto the free-list, so that subsequent allocations in this size
       * class are served without touching the underlying allocator. */
      const int64_t allocation_size = std::clamp<int64_t>(
          buffer_stack.element_size * 16, 512, 4096);
      void *buffer = allocator_.allocate(allocation_size, s_alignment, __func__);
      BLI_asan_poison(buffer, allocation_size);
      const int64_t num = allocation_size / buffer_stack.element_size;
      for (int64_t i = num - 1; i > 0; i--) {
        buffer_stack.stack.push(POINTER_OFFSET(buffer, buffer_stack.element_size * i));
      }
      owned_buffers_.append({static_cast<std::byte *>(buffer), allocation_size});
      BLI_asan_unpoison(buffer, size);
      return buffer;
    }
    /* Sizes above 4096 bytes are forwarded to the underlying allocator directly. */
    void *buffer = allocator_.allocate(
        size_t(size), std::max<size_t>(s_alignment, size_t(alignment)), __func__);
    owned_buffers_.append({static_cast<std::byte *>(buffer), size});
    return buffer;
  }

  void deallocate(const void *buffer, const int64_t size, const int64_t alignment)
  {
    BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
#ifdef DEBUG
    memset(const_cast<void *>(buffer), -1, size);
#endif
    /* Poison the whole freed element, not just `alignment` bytes. */
    BLI_asan_poison(buffer, size);
    BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
    buffer_stack.stack.push(const_cast<void *>(buffer));
  }

  template<typename T, typename... Args> destruct_ptr<T> construct(Args &&...args)
  {
    void *buffer = this->allocate(sizeof(T), alignof(T));
    T *value = new (buffer) T(std::forward<Args>(args)...);
    return destruct_ptr<T>(value);
  }

  template<typename T> MutableSpan<T> allocate_array(int64_t size)
  {
    T *array = static_cast<T *>(this->allocate(sizeof(T) * size, alignof(T)));
    return MutableSpan<T>(array, size);
  }

  template<typename T, typename... Args>
  MutableSpan<T> construct_array(int64_t size, Args &&...args)
  {
    MutableSpan<T> array = this->allocate_array<T>(size);
    for (const int64_t i : IndexRange(size)) {
      new (&array[i]) T(std::forward<Args>(args)...);
    }
    return array;
  }

 private:
  BufferStack &get_buffer_stack(const int64_t size, const int64_t /*alignment*/)
  {
    if (size <= 64) {
      /* Sizes 1..64 map to the buckets 8, 16, ..., 64. */
      return small_stacks_[(size - (size != 0)) >> 3];
    }
    if (!large_stacks_) {
      large_stacks_ = std::make_unique<Map<int, BufferStack>>();
    }
    /* Group larger sizes into power-of-two buckets. */
    const int key = bitscan_reverse_uint64(uint64_t(size));
    return large_stacks_->lookup_or_add_cb(key, [&]() {
      BufferStack buffer_stack;
      buffer_stack.element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
      return buffer_stack;
    });
  }
};

}  // namespace blender
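Note (illustration, not part of the patch): a minimal usage sketch of the pool defined above; the function name is hypothetical. Allocating, freeing, and re-allocating in the same size class is expected to hand back the previously freed buffer, which is the reuse this allocator is built around:

```cpp
#include "BLI_local_pool.hh"

namespace blender {

static void local_pool_usage_sketch()
{
  LocalPoolScope scope;
  LocalPool<> pool(scope);

  /* 24 bytes falls into the 24-byte small bucket ((24 - 1) >> 3 == 2). */
  void *a = pool.allocate(24, 8);
  pool.deallocate(a, 24, 8);

  /* Served from the free-list; expected to be the same address as `a`. */
  void *b = pool.allocate(24, 8);
  BLI_assert(a == b);
  UNUSED_VARS(b);

  /* Objects can be constructed in pool memory; the returned destruct_ptr only
   * runs the destructor. The memory itself is reclaimed when the pool dies. */
  destruct_ptr<int> value = pool.construct<int>(42);
  UNUSED_VARS(value);
}

}  // namespace blender
```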
@@ -255,6 +255,7 @@ set(SRC
   BLI_linklist_stack.h
   BLI_listbase.h
   BLI_listbase_wrapper.hh
+  BLI_local_pool.hh
   BLI_map.hh
   BLI_map_slots.hh
   BLI_math.h
@@ -479,6 +480,7 @@ if(WITH_GTESTS)
     tests/BLI_linear_allocator_test.cc
     tests/BLI_linklist_lockfree_test.cc
     tests/BLI_listbase_test.cc
+    tests/BLI_local_pool_test.cc
     tests/BLI_map_test.cc
     tests/BLI_math_base_safe_test.cc
     tests/BLI_math_base_test.cc
@@ -0,0 +1,18 @@
/* SPDX-License-Identifier: Apache-2.0 */

#include <iostream>

#include "BLI_local_pool.hh"
#include "BLI_strict_flags.h"

#include "testing/testing.h"

namespace blender::tests {

TEST(local_pool, Test)
{
  LocalPoolScope pool_scope;
  LocalPool pool(pool_scope);

  std::cout << pool.allocate(30000, 8) << "\n";
}

}  // namespace blender::tests
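Note (suggestion, not in the patch): the test above only prints an address. A sketch of a test that actually asserts the reuse behavior the pool is designed for, following the same conventions as the file above:

```cpp
TEST(local_pool, ReuseSameSizeClass)
{
  LocalPoolScope pool_scope;
  LocalPool pool(pool_scope);

  /* 24 bytes maps to one of the small 8..64 byte buckets. */
  void *a = pool.allocate(24, 8);
  pool.deallocate(a, 24, 8);

  /* The freed buffer should be handed out again for the same size class. */
  void *b = pool.allocate(24, 8);
  EXPECT_EQ(a, b);
}
```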
@@ -42,6 +42,7 @@
 #include "BLI_function_ref.hh"
 #include "BLI_generic_pointer.hh"
 #include "BLI_linear_allocator.hh"
+#include "BLI_local_pool.hh"
 #include "BLI_vector.hh"

 #include <atomic>
@@ -98,6 +99,8 @@ struct Context {
    * Custom user data that can be used in the function.
    */
   UserData *user_data;
+
+  LocalPool<> *local_pool = nullptr;
 };

 /**
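Note (illustration, not part of the patch): the patch only adds the `local_pool` pointer to the context here. A sketch of how a function implementation could use it, with a null check since not every caller necessarily provides a pool; the helper name is hypothetical:

```cpp
#include "FN_lazy_function.hh"

namespace blender::fn::lazy_function {

/* Hypothetical helper: take per-invocation scratch memory from the context's
 * pool when one is available, and hand it back so the pool can reuse it. */
static void use_scratch_memory(const Context &context)
{
  if (context.local_pool == nullptr) {
    return;
  }
  LocalPool<> &pool = *context.local_pool;
  MutableSpan<int> scratch = pool.allocate_array<int>(128);
  scratch.fill(0);
  /* ... compute with scratch ... */
  pool.deallocate(scratch.data(), sizeof(int) * scratch.size(), alignof(int));
}

}  // namespace blender::fn::lazy_function
```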
@@ -276,7 +279,7 @@ class LazyFunction {
    * Allocates storage for this function. The storage will be passed to every call to #execute.
    * If the function does not keep track of any state, this does not have to be implemented.
    */
-  virtual void *init_storage(LinearAllocator<> &allocator) const;
+  virtual void *init_storage(LocalPool<> &allocator) const;

   /**
    * Destruct the storage created in #init_storage.
@@ -85,7 +85,8 @@ inline void execute_lazy_function_eagerly_impl(
       ...);
   output_usages.fill(ValueUsage::Used);
   set_outputs.fill(false);
-  LinearAllocator<> allocator;
+  LocalPoolScope local_pool_scope;
+  LocalPool<> allocator(local_pool_scope);
   Context context;
   context.user_data = user_data;
   context.storage = fn.init_storage(allocator);
@@ -88,7 +88,7 @@ class GraphExecutor : public LazyFunction {
                 const Logger *logger,
                 const SideEffectProvider *side_effect_provider);

-  void *init_storage(LinearAllocator<> &allocator) const override;
+  void *init_storage(LocalPool<> &allocator) const override;
   void destruct_storage(void *storage) const override;

  private:
@@ -25,7 +25,7 @@ std::string LazyFunction::output_name(int index) const
   return outputs_[index].debug_name;
 }

-void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
+void *LazyFunction::init_storage(LocalPool<> & /*allocator*/) const
 {
   return nullptr;
 }
@@ -241,15 +241,16 @@ class Executor {
 #ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
   std::thread::id current_main_thread_;
 #endif
-  /**
-   * A separate linear allocator for every thread. We could potentially reuse some memory, but
-   * that doesn't seem worth it yet.
-   */
+  LocalPoolScope local_pool_scope_;
   struct ThreadLocalData {
-    LinearAllocator<> allocator;
+    LocalPool<> local_pool;
+
+    ThreadLocalData(const LocalPoolScope &local_pool_scope) : local_pool(local_pool_scope)
+    {
+    }
   };
   std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
-  LinearAllocator<> main_allocator_;
+  LocalPool<> main_allocator_;
   /**
    * Set to false when the first execution ends.
    */
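Note (editorial, not part of the patch): the constructor in the next hunk initializes `main_allocator_` from `local_pool_scope_`. This is only safe because C++ initializes members in declaration order and `local_pool_scope_` is declared before `main_allocator_` above. A reduced sketch of the pattern, with illustrative names:

```cpp
struct Scope {
};

struct Pool {
  const Scope &scope_;
  explicit Pool(const Scope &scope) : scope_(scope) {}
};

struct Owner {
  /* Declared first, so it is initialized before pool_. */
  Scope scope_;
  Pool pool_;

  /* OK only because members are initialized in declaration order: scope_ is
   * fully constructed before pool_ binds a reference to it. Swapping the two
   * declarations would bind a reference to an uninitialized member. */
  Owner() : pool_(scope_) {}
};
```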
@@ -258,7 +259,8 @@ class Executor {
   friend GraphExecutorLFParams;

  public:
-  Executor(const GraphExecutor &self) : self_(self), loaded_inputs_(self.graph_inputs_.size())
+  Executor(const GraphExecutor &self)
+      : self_(self), loaded_inputs_(self.graph_inputs_.size()), main_allocator_(local_pool_scope_)
   {
     /* The indices are necessary, because they are used as keys in #node_states_. */
     BLI_assert(self_.graph_.node_indices_are_valid());
@@ -340,7 +342,7 @@ class Executor {
     Span<const Node *> nodes = self_.graph_.nodes();
     node_states_.reinitialize(nodes.size());

-    auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
+    auto construct_node_range = [&](const IndexRange range, LocalPool<> &allocator) {
       for (const int i : range) {
         const Node &node = *nodes[i];
         NodeState &node_state = *allocator.construct<NodeState>().release();
@@ -355,13 +357,13 @@ class Executor {
       this->ensure_thread_locals();
       /* Construct all node states in parallel. */
       threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
-        LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+        LocalPool<> &allocator = this->get_main_or_local_allocator();
         construct_node_range(range, allocator);
       });
     }
   }

-  void construct_initial_node_state(LinearAllocator<> &allocator,
+  void construct_initial_node_state(LocalPool<> &allocator,
                                     const Node &node,
                                     NodeState &node_state)
   {
@@ -533,7 +535,7 @@ class Executor {

   void forward_newly_provided_inputs(CurrentTask &current_task)
   {
-    LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+    LocalPool<> &allocator = this->get_main_or_local_allocator();
     for (const int graph_input_index : self_.graph_inputs_.index_range()) {
       std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
       if (was_loaded.load()) {
@@ -552,7 +554,7 @@ class Executor {
   }

   void forward_newly_provided_input(CurrentTask &current_task,
-                                    LinearAllocator<> &allocator,
+                                    LocalPool<> &allocator,
                                     const int graph_input_index,
                                     void *input_data)
   {
@@ -706,7 +708,7 @@ class Executor {
   void run_node_task(const FunctionNode &node, CurrentTask &current_task)
   {
     NodeState &node_state = *node_states_[node.index_in_graph()];
-    LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+    LocalPool<> &allocator = this->get_main_or_local_allocator();
     const LazyFunction &fn = node.function();

     bool node_needs_execution = false;
@@ -965,7 +967,7 @@ class Executor {
                      CurrentTask &current_task)
   {
     BLI_assert(value_to_forward.get() != nullptr);
-    LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+    LocalPool<> &allocator = this->get_main_or_local_allocator();
     const CPPType &type = *value_to_forward.type();

     if (self_.logger_ != nullptr) {
@@ -1091,7 +1093,8 @@ class Executor {
     }
 #endif
     if (!thread_locals_) {
-      thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
+      thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>(
+          [scope = &local_pool_scope_]() { return ThreadLocalData{*scope}; });
     }
   }
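Note (context, not part of the patch): the initializer lambda is needed because `ThreadLocalData` is no longer default-constructible; each thread's pool must be bound to the shared scope at construction time. A reduced sketch of the same pattern, assuming `threading::EnumerableThreadSpecific` forwards a callable to construct each thread's value the way this hunk relies on:

```cpp
#include "BLI_task.hh"

struct PerThreadData {
  int seed;
  explicit PerThreadData(const int seed) : seed(seed) {}
};

static void per_thread_init_sketch()
{
  const int shared_seed = 7;
  /* The callable runs once per thread, on that thread's first call to local(),
   * which allows per-thread values that are not default-constructible. */
  blender::threading::EnumerableThreadSpecific<PerThreadData> per_thread(
      [&]() { return PerThreadData(shared_seed); });
  PerThreadData &mine = per_thread.local();
  (void)mine;
}
```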
@@ -1130,10 +1133,10 @@ class Executor {
     });
   }

-  LinearAllocator<> &get_main_or_local_allocator()
+  LocalPool<> &get_main_or_local_allocator()
   {
     if (this->use_multi_threading()) {
-      return thread_locals_->local().allocator;
+      return thread_locals_->local().local_pool;
     }
     return main_allocator_;
   }
@@ -1184,7 +1187,7 @@ class GraphExecutorLFParams final : public Params {
     OutputState &output_state = node_state_.outputs[index];
     BLI_assert(!output_state.has_been_computed);
     if (output_state.value == nullptr) {
-      LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
+      LocalPool<> &allocator = executor_.get_main_or_local_allocator();
       const CPPType &type = node_.output(index).type();
       output_state.value = allocator.allocate(type.size(), type.alignment());
     }
@@ -1296,7 +1299,7 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
   executor.execute(params, context);
 }

-void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
+void *GraphExecutor::init_storage(LocalPool<> &allocator) const
 {
   Executor &executor = *allocator.construct<Executor>(*this).release();
   return &executor;
@@ -1163,7 +1163,8 @@ static GeometrySet compute_geometry(
   blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
   user_data.compute_context = &modifier_compute_context;

-  blender::LinearAllocator<> allocator;
+  blender::LocalPoolScope local_pool_scope;
+  blender::LocalPool<> allocator(local_pool_scope);
   Vector<GMutablePointer> inputs_to_destruct;

   int input_index;
@@ -689,7 +689,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
     graph_executor_->execute(params, group_context);
   }

-  void *init_storage(LinearAllocator<> &allocator) const override
+  void *init_storage(LocalPool<> &allocator) const override
   {
     Storage *s = allocator.construct<Storage>().release();
     s->graph_executor_storage = graph_executor_->init_storage(allocator);