WIP: Functions: new local allocator for better memory reuse and performance #104630
|
@ -0,0 +1,132 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>
#include <memory>

#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"
|
||||
|
||||
namespace blender {
|
||||
|
||||
/**
 * Ties the lifetime of one or more #LocalPool instances to a lexical scope.
 * Currently carries no state; it exists so that every pool must be created with
 * an explicit scope object, making ownership/lifetime intent visible at call
 * sites (a pool stores a reference to its scope).
 */
class LocalPoolScope {
};
|
||||
|
||||
template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
|
||||
private:
|
||||
static constexpr int64_t s_alignment = 64;
|
||||
|
||||
const LocalPoolScope &pool_scope_;
|
||||
Vector<MutableSpan<std::byte>> owned_buffers_;
|
||||
|
||||
struct BufferStack {
|
||||
int64_t element_size = -1;
|
||||
Stack<void *, 0> stack;
|
||||
};
|
||||
|
||||
std::array<BufferStack, 8> small_stacks_;
|
||||
std::unique_ptr<Map<int, BufferStack>> large_stacks_;
|
||||
|
||||
BLI_NO_UNIQUE_ADDRESS Allocator allocator_;
|
||||
|
||||
public:
|
||||
LocalPool(const LocalPoolScope &pool_scope) : pool_scope_(pool_scope)
|
||||
{
|
||||
for (const int64_t i : IndexRange(small_stacks_.size())) {
|
||||
small_stacks_[i].element_size = 8 * (i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
~LocalPool()
|
||||
{
|
||||
for (MutableSpan<std::byte> buffer : owned_buffers_) {
|
||||
BLI_asan_unpoison(buffer.data(), buffer.size());
|
||||
allocator_.deallocate(buffer.data());
|
||||
}
|
||||
}
|
||||
|
||||
void *allocate(const int64_t size, const int64_t alignment)
|
||||
{
|
||||
BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
|
||||
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
|
||||
if (!buffer_stack.stack.is_empty()) {
|
||||
void *buffer = buffer_stack.stack.pop();
|
||||
BLI_asan_unpoison(buffer, size);
|
||||
return buffer;
|
||||
}
|
||||
if (size <= 4096) {
|
||||
const int64_t allocation_size = std::clamp<int64_t>(
|
||||
buffer_stack.element_size * 16, 512, 4096);
|
||||
void *buffer = allocator_.allocate(allocation_size, s_alignment, __func__);
|
||||
BLI_asan_poison(buffer, allocation_size);
|
||||
const int64_t num = allocation_size / buffer_stack.element_size;
|
||||
for (int64_t i = num - 1; i > 0; i--) {
|
||||
buffer_stack.stack.push(POINTER_OFFSET(buffer, buffer_stack.element_size * i));
|
||||
}
|
||||
owned_buffers_.append({static_cast<std::byte *>(buffer), allocation_size});
|
||||
BLI_asan_unpoison(buffer, size);
|
||||
return buffer;
|
||||
}
|
||||
void *buffer = allocator_.allocate(
|
||||
size_t(size), std::max<size_t>(s_alignment, size_t(alignment)), __func__);
|
||||
owned_buffers_.append({static_cast<std::byte *>(buffer), size});
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void deallocate(const void *buffer, const int64_t size, const int64_t alignment)
|
||||
{
|
||||
BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
|
||||
#ifdef DEBUG
|
||||
memset(buffer, -1, size);
|
||||
#endif
|
||||
BLI_asan_poison(buffer, alignment);
|
||||
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
|
||||
buffer_stack.stack.push(buffer);
|
||||
}
|
||||
|
||||
template<typename T, typename... Args> destruct_ptr<T> construct(Args &&...args)
|
||||
{
|
||||
void *buffer = this->allocate(sizeof(T), alignof(T));
|
||||
T *value = new (buffer) T(std::forward<Args>(args)...);
|
||||
return destruct_ptr<T>(value);
|
||||
}
|
||||
|
||||
template<typename T> MutableSpan<T> allocate_array(int64_t size)
|
||||
{
|
||||
T *array = static_cast<T *>(this->allocate(sizeof(T) * size, alignof(T)));
|
||||
return MutableSpan<T>(array, size);
|
||||
}
|
||||
|
||||
template<typename T, typename... Args>
|
||||
MutableSpan<T> construct_array(int64_t size, Args &&...args)
|
||||
{
|
||||
MutableSpan<T> array = this->allocate_array<T>(size);
|
||||
for (const int64_t i : IndexRange(size)) {
|
||||
new (&array[i]) T(std::forward<Args>(args)...);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
private:
|
||||
BufferStack &get_buffer_stack(const int64_t size, const int64_t /*alignment*/)
|
||||
{
|
||||
if (size <= 64) {
|
||||
return small_stacks_[(size - (size != 0)) >> 3];
|
||||
}
|
||||
if (!large_stacks_) {
|
||||
large_stacks_ = std::make_unique<Map<int, BufferStack>>();
|
||||
}
|
||||
const int key = bitscan_reverse_uint64(uint64_t(size));
|
||||
return large_stacks_->lookup_or_add_cb(key, [&]() {
|
||||
BufferStack buffer_stack;
|
||||
buffer_stack.element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
|
||||
return buffer_stack;
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace blender
|
|
@ -255,6 +255,7 @@ set(SRC
|
|||
BLI_linklist_stack.h
|
||||
BLI_listbase.h
|
||||
BLI_listbase_wrapper.hh
|
||||
BLI_local_pool.hh
|
||||
BLI_map.hh
|
||||
BLI_map_slots.hh
|
||||
BLI_math.h
|
||||
|
@ -479,6 +480,7 @@ if(WITH_GTESTS)
|
|||
tests/BLI_linear_allocator_test.cc
|
||||
tests/BLI_linklist_lockfree_test.cc
|
||||
tests/BLI_listbase_test.cc
|
||||
tests/BLI_local_pool_test.cc
|
||||
tests/BLI_map_test.cc
|
||||
tests/BLI_math_base_safe_test.cc
|
||||
tests/BLI_math_base_test.cc
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0 */
|
||||
|
||||
#include "BLI_local_pool.hh"
|
||||
#include "BLI_strict_flags.h"
|
||||
|
||||
#include "testing/testing.h"
|
||||
|
||||
namespace blender::tests {
|
||||
|
||||
/* Smoke test: create a pool bound to a scope and allocate one large (>4 KiB)
 * buffer, printing the returned address. Only verifies that allocation does
 * not crash — no assertions yet on alignment, reuse, or poisoning (WIP). */
TEST(local_pool, Test)
{
  LocalPoolScope pool_scope;
  LocalPool pool(pool_scope);

  std::cout << pool.allocate(30000, 8) << "\n";
}
|
||||
|
||||
} // namespace blender::tests
|
|
@ -42,6 +42,7 @@
|
|||
#include "BLI_function_ref.hh"
|
||||
#include "BLI_generic_pointer.hh"
|
||||
#include "BLI_linear_allocator.hh"
|
||||
#include "BLI_local_pool.hh"
|
||||
#include "BLI_vector.hh"
|
||||
|
||||
#include <atomic>
|
||||
|
@ -98,6 +99,8 @@ struct Context {
|
|||
* Custom user data that can be used in the function.
|
||||
*/
|
||||
UserData *user_data;
|
||||
|
||||
LocalPool<> *local_pool = nullptr;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -276,7 +279,7 @@ class LazyFunction {
|
|||
* Allocates storage for this function. The storage will be passed to every call to #execute.
|
||||
* If the function does not keep track of any state, this does not have to be implemented.
|
||||
*/
|
||||
virtual void *init_storage(LinearAllocator<> &allocator) const;
|
||||
virtual void *init_storage(LocalPool<> &allocator) const;
|
||||
|
||||
/**
|
||||
* Destruct the storage created in #init_storage.
|
||||
|
|
|
@ -85,7 +85,8 @@ inline void execute_lazy_function_eagerly_impl(
|
|||
...);
|
||||
output_usages.fill(ValueUsage::Used);
|
||||
set_outputs.fill(false);
|
||||
LinearAllocator<> allocator;
|
||||
LocalPoolScope local_pool_scope;
|
||||
LocalPool<> allocator(local_pool_scope);
|
||||
Context context;
|
||||
context.user_data = user_data;
|
||||
context.storage = fn.init_storage(allocator);
|
||||
|
|
|
@ -88,7 +88,7 @@ class GraphExecutor : public LazyFunction {
|
|||
const Logger *logger,
|
||||
const SideEffectProvider *side_effect_provider);
|
||||
|
||||
void *init_storage(LinearAllocator<> &allocator) const override;
|
||||
void *init_storage(LocalPool<> &allocator) const override;
|
||||
void destruct_storage(void *storage) const override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -25,7 +25,7 @@ std::string LazyFunction::output_name(int index) const
|
|||
return outputs_[index].debug_name;
|
||||
}
|
||||
|
||||
void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
|
||||
void *LazyFunction::init_storage(LocalPool<> & /*allocator*/) const
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -241,15 +241,16 @@ class Executor {
|
|||
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
|
||||
std::thread::id current_main_thread_;
|
||||
#endif
|
||||
/**
|
||||
* A separate linear allocator for every thread. We could potentially reuse some memory, but that
|
||||
* doesn't seem worth it yet.
|
||||
*/
|
||||
LocalPoolScope local_pool_scope_;
|
||||
struct ThreadLocalData {
|
||||
LinearAllocator<> allocator;
|
||||
LocalPool<> local_pool;
|
||||
|
||||
ThreadLocalData(const LocalPoolScope &local_pool_scope) : local_pool(local_pool_scope)
|
||||
{
|
||||
}
|
||||
};
|
||||
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
|
||||
LinearAllocator<> main_allocator_;
|
||||
LocalPool<> main_allocator_;
|
||||
/**
|
||||
* Set to false when the first execution ends.
|
||||
*/
|
||||
|
@ -258,7 +259,8 @@ class Executor {
|
|||
friend GraphExecutorLFParams;
|
||||
|
||||
public:
|
||||
Executor(const GraphExecutor &self) : self_(self), loaded_inputs_(self.graph_inputs_.size())
|
||||
Executor(const GraphExecutor &self)
|
||||
: self_(self), loaded_inputs_(self.graph_inputs_.size()), main_allocator_(local_pool_scope_)
|
||||
{
|
||||
/* The indices are necessary, because they are used as keys in #node_states_. */
|
||||
BLI_assert(self_.graph_.node_indices_are_valid());
|
||||
|
@ -340,7 +342,7 @@ class Executor {
|
|||
Span<const Node *> nodes = self_.graph_.nodes();
|
||||
node_states_.reinitialize(nodes.size());
|
||||
|
||||
auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
|
||||
auto construct_node_range = [&](const IndexRange range, LocalPool<> &allocator) {
|
||||
for (const int i : range) {
|
||||
const Node &node = *nodes[i];
|
||||
NodeState &node_state = *allocator.construct<NodeState>().release();
|
||||
|
@ -355,13 +357,13 @@ class Executor {
|
|||
this->ensure_thread_locals();
|
||||
/* Construct all node states in parallel. */
|
||||
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
|
||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
construct_node_range(range, allocator);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void construct_initial_node_state(LinearAllocator<> &allocator,
|
||||
void construct_initial_node_state(LocalPool<> &allocator,
|
||||
const Node &node,
|
||||
NodeState &node_state)
|
||||
{
|
||||
|
@ -533,7 +535,7 @@ class Executor {
|
|||
|
||||
void forward_newly_provided_inputs(CurrentTask ¤t_task)
|
||||
{
|
||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
|
||||
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
|
||||
if (was_loaded.load()) {
|
||||
|
@ -552,7 +554,7 @@ class Executor {
|
|||
}
|
||||
|
||||
void forward_newly_provided_input(CurrentTask ¤t_task,
|
||||
LinearAllocator<> &allocator,
|
||||
LocalPool<> &allocator,
|
||||
const int graph_input_index,
|
||||
void *input_data)
|
||||
{
|
||||
|
@ -706,7 +708,7 @@ class Executor {
|
|||
void run_node_task(const FunctionNode &node, CurrentTask ¤t_task)
|
||||
{
|
||||
NodeState &node_state = *node_states_[node.index_in_graph()];
|
||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
const LazyFunction &fn = node.function();
|
||||
|
||||
bool node_needs_execution = false;
|
||||
|
@ -965,7 +967,7 @@ class Executor {
|
|||
CurrentTask ¤t_task)
|
||||
{
|
||||
BLI_assert(value_to_forward.get() != nullptr);
|
||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = this->get_main_or_local_allocator();
|
||||
const CPPType &type = *value_to_forward.type();
|
||||
|
||||
if (self_.logger_ != nullptr) {
|
||||
|
@ -1091,7 +1093,8 @@ class Executor {
|
|||
}
|
||||
#endif
|
||||
if (!thread_locals_) {
|
||||
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
|
||||
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>(
|
||||
[scope = &local_pool_scope_]() { return ThreadLocalData{*scope}; });
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1130,10 +1133,10 @@ class Executor {
|
|||
});
|
||||
}
|
||||
|
||||
LinearAllocator<> &get_main_or_local_allocator()
|
||||
LocalPool<> &get_main_or_local_allocator()
|
||||
{
|
||||
if (this->use_multi_threading()) {
|
||||
return thread_locals_->local().allocator;
|
||||
return thread_locals_->local().local_pool;
|
||||
}
|
||||
return main_allocator_;
|
||||
}
|
||||
|
@ -1184,7 +1187,7 @@ class GraphExecutorLFParams final : public Params {
|
|||
OutputState &output_state = node_state_.outputs[index];
|
||||
BLI_assert(!output_state.has_been_computed);
|
||||
if (output_state.value == nullptr) {
|
||||
LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
|
||||
LocalPool<> &allocator = executor_.get_main_or_local_allocator();
|
||||
const CPPType &type = node_.output(index).type();
|
||||
output_state.value = allocator.allocate(type.size(), type.alignment());
|
||||
}
|
||||
|
@ -1296,7 +1299,7 @@ void GraphExecutor::execute_impl(Params ¶ms, const Context &context) const
|
|||
executor.execute(params, context);
|
||||
}
|
||||
|
||||
void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
|
||||
void *GraphExecutor::init_storage(LocalPool<> &allocator) const
|
||||
{
|
||||
Executor &executor = *allocator.construct<Executor>(*this).release();
|
||||
return &executor;
|
||||
|
|
|
@ -1163,7 +1163,8 @@ static GeometrySet compute_geometry(
|
|||
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
|
||||
user_data.compute_context = &modifier_compute_context;
|
||||
|
||||
blender::LinearAllocator<> allocator;
|
||||
blender::LocalPoolScope local_pool_scope;
|
||||
blender::LocalPool<> allocator(local_pool_scope);
|
||||
Vector<GMutablePointer> inputs_to_destruct;
|
||||
|
||||
int input_index;
|
||||
|
|
|
@ -689,7 +689,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
|
|||
graph_executor_->execute(params, group_context);
|
||||
}
|
||||
|
||||
void *init_storage(LinearAllocator<> &allocator) const override
|
||||
void *init_storage(LocalPool<> &allocator) const override
|
||||
{
|
||||
Storage *s = allocator.construct<Storage>().release();
|
||||
s->graph_executor_storage = graph_executor_->init_storage(allocator);
|
||||
|
|
Loading…
Reference in New Issue