WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
5 changed files with 120 additions and 66 deletions
Showing only changes of commit a5bf084fb6 - Show all commits

View File

@ -145,7 +145,7 @@ inline LocalAllocatorSet &LocalAllocator::owner_set()
return owner_set_;
}
inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment)
BLI_NOINLINE inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment)
{
BLI_assert(size > 0);
BLI_assert(alignment <= size);
@ -180,9 +180,9 @@ inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignmen
return buffer;
}
inline void LocalAllocator::deallocate(const void *buffer,
const int64_t size,
const int64_t alignment)
BLI_NOINLINE inline void LocalAllocator::deallocate(const void *buffer,
const int64_t size,
const int64_t alignment)
{
BLI_assert(size > 0);
BLI_assert(alignment <= size);

View File

@ -12,24 +12,30 @@
* - Pass cached data to called functions.
*/
#include "BLI_local_allocator.hh"
#include "BLI_utildefines.h"
#include "BLI_map.hh"
namespace blender::fn {
class MFContext;
class MFContextBuilder {
private:
Map<std::string, const void *> global_contexts_;
std::unique_ptr<LocalAllocatorSet> allocator_set_;
LocalAllocator *allocator_;
friend MFContext;
public:
template<typename T> void add_global_context(std::string name, const T *context)
MFContextBuilder(LocalAllocator *allocator = nullptr)
{
global_contexts_.add_new(std::move(name), static_cast<const void *>(context));
if (allocator) {
allocator_ = allocator;
}
else {
allocator_set_ = std::make_unique<LocalAllocatorSet>();
allocator_ = &allocator_set_->local();
}
}
};
@ -42,11 +48,9 @@ class MFContext {
{
}
template<typename T> const T *get_global_context(StringRef name) const
LocalAllocator &allocator()
{
const void *context = builder_.global_contexts_.lookup_default_as(name, nullptr);
/* TODO: Implement type checking. */
return static_cast<const T *>(context);
return *builder_.allocator_;
}
};

View File

@ -127,7 +127,8 @@ void MultiFunction::call_auto(IndexMask mask, MFParams params, MFContext context
}
}
this->call(offset_mask, offset_params, context);
MFContextBuilder sub_context{&context.allocator().local()};
this->call(offset_mask, offset_params, sub_context);
});
}

View File

@ -132,32 +132,72 @@ class ValueAllocator : NonCopyable, NonMovable {
static constexpr inline int min_alignment = 64;
/** All buffers in the free-lists below have been allocated with this allocator. */
LinearAllocator<> &linear_allocator_;
LocalAllocator &local_allocator_;
int array_size_;
/**
* Use stacks so that the most recently used buffers are reused first. This improves cache
* efficiency.
*/
std::array<Stack<VariableValue *>, tot_variable_value_types> variable_value_free_lists_;
std::array<Vector<VariableValue *>, tot_variable_value_types> variable_value_free_lists_;
/**
* The integer key is the size of one element (e.g. 4 for an integer buffer). All buffers are
* aligned to #min_alignment bytes.
*/
Stack<void *> small_span_buffers_free_list_;
Map<int, Stack<void *>> span_buffers_free_lists_;
Vector<void *> small_span_buffers_free_list_;
Map<int, Vector<void *>> span_buffers_free_lists_;
/** Cache buffers for single values of different types. */
static constexpr inline int small_value_max_size = 16;
static constexpr inline int small_value_max_alignment = 8;
Stack<void *> small_single_value_free_list_;
Map<const CPPType *, Stack<void *>> single_value_free_lists_;
Vector<void *> small_single_value_free_list_;
Map<const CPPType *, Vector<void *>> single_value_free_lists_;
public:
ValueAllocator(LinearAllocator<> &linear_allocator) : linear_allocator_(linear_allocator)
ValueAllocator(LocalAllocator &local_allocator, const int array_size)
: local_allocator_(local_allocator), array_size_(array_size)
{
}
~ValueAllocator()
{
  /* Free cached value headers first; they were allocated with (sizeof(T), alignof(T))
   * in #obtain. */
  this->deallocate_variable_values<VariableValue_GVArray>();
  this->deallocate_variable_values<VariableValue_Span>();
  this->deallocate_variable_values<VariableValue_GVVectorArray>();
  this->deallocate_variable_values<VariableValue_GVectorArray>();
  this->deallocate_variable_values<VariableValue_OneSingle>();
  this->deallocate_variable_values<VariableValue_OneVector>();
  /* Span buffers are allocated in #obtain_Span with
   * max(element_size, small_value_max_size) * array_size_ bytes at #min_alignment.
   * Deallocate with exactly that size/alignment; #LocalAllocator::deallocate takes the
   * explicit size and alignment, so a mismatch here is incorrect. For buffers in the
   * small free-list, element_size <= small_value_max_size, so the clamp resolves to
   * small_value_max_size. */
  for (void *buffer : small_span_buffers_free_list_) {
    local_allocator_.deallocate(buffer, small_value_max_size * array_size_, min_alignment);
  }
  for (const auto item : span_buffers_free_lists_.items()) {
    const int element_size = item.key;
    /* NOTE(review): buffers for types with alignment > min_alignment are allocated with
     * that larger alignment but still end up in this map on release; those cannot be
     * distinguished here — confirm how that rare case should be handled. */
    for (const void *buffer : item.value) {
      local_allocator_.deallocate(
          buffer,
          std::max<int64_t>(element_size, small_value_max_size) * array_size_,
          min_alignment);
    }
  }
  /* Single-value buffers are allocated in #obtain_OneSingle with at least
   * (small_value_max_size, small_value_max_alignment); mirror that clamping here so the
   * deallocation parameters match the original allocation. */
  for (void *buffer : small_single_value_free_list_) {
    local_allocator_.deallocate(buffer, small_value_max_size, small_value_max_alignment);
  }
  for (const auto item : single_value_free_lists_.items()) {
    const CPPType &type = *item.key;
    for (const void *buffer : item.value) {
      local_allocator_.deallocate(buffer,
                                  std::max<int>(small_value_max_size, type.size()),
                                  std::max<int>(small_value_max_alignment, type.alignment()));
    }
  }
}
template<typename T> void deallocate_variable_values()
{
for (VariableValue *value : variable_value_free_lists_[int(T::static_type)]) {
local_allocator_.deallocate(value, sizeof(T), alignof(T));
}
}
VariableValue_GVArray *obtain_GVArray(const GVArray &varray)
{
return this->obtain<VariableValue_GVArray>(varray);
@ -173,7 +213,7 @@ class ValueAllocator : NonCopyable, NonMovable {
return this->obtain<VariableValue_Span>(buffer, false);
}
VariableValue_Span *obtain_Span(const CPPType &type, int size)
VariableValue_Span *obtain_Span(const CPPType &type)
{
void *buffer = nullptr;
@ -182,20 +222,20 @@ class ValueAllocator : NonCopyable, NonMovable {
if (alignment > min_alignment) {
/* In this rare case we fallback to not reusing existing buffers. */
buffer = linear_allocator_.allocate(element_size * size, alignment);
buffer = local_allocator_.allocate(element_size * array_size_, alignment);
}
else {
Stack<void *> *stack = type.can_exist_in_buffer(small_value_max_size,
small_value_max_alignment) ?
&small_span_buffers_free_list_ :
span_buffers_free_lists_.lookup_ptr(element_size);
Vector<void *> *stack = type.can_exist_in_buffer(small_value_max_size,
small_value_max_alignment) ?
&small_span_buffers_free_list_ :
span_buffers_free_lists_.lookup_ptr(element_size);
if (stack == nullptr || stack->is_empty()) {
buffer = linear_allocator_.allocate(
std::max<int64_t>(element_size, small_value_max_size) * size, min_alignment);
buffer = local_allocator_.allocate(
std::max<int64_t>(element_size, small_value_max_size) * array_size_, min_alignment);
}
else {
/* Reuse existing buffer. */
buffer = stack->pop();
buffer = stack->pop_last();
}
}
@ -207,9 +247,9 @@ class ValueAllocator : NonCopyable, NonMovable {
return this->obtain<VariableValue_GVectorArray>(data, false);
}
VariableValue_GVectorArray *obtain_GVectorArray(const CPPType &type, int size)
VariableValue_GVectorArray *obtain_GVectorArray(const CPPType &type)
{
GVectorArray *vector_array = new GVectorArray(type, size);
GVectorArray *vector_array = new GVectorArray(type, array_size_);
return this->obtain<VariableValue_GVectorArray>(*vector_array, true);
}
@ -217,16 +257,16 @@ class ValueAllocator : NonCopyable, NonMovable {
{
const bool is_small = type.can_exist_in_buffer(small_value_max_size,
small_value_max_alignment);
Stack<void *> &stack = is_small ? small_single_value_free_list_ :
single_value_free_lists_.lookup_or_add_default(&type);
Vector<void *> &stack = is_small ? small_single_value_free_list_ :
single_value_free_lists_.lookup_or_add_default(&type);
void *buffer;
if (stack.is_empty()) {
buffer = linear_allocator_.allocate(
buffer = local_allocator_.allocate(
std::max<int>(small_value_max_size, type.size()),
std::max<int>(small_value_max_alignment, type.alignment()));
}
else {
buffer = stack.pop();
buffer = stack.pop_last();
}
return this->obtain<VariableValue_OneSingle>(buffer);
}
@ -248,11 +288,12 @@ class ValueAllocator : NonCopyable, NonMovable {
if (value_typed->owned) {
const CPPType &type = data_type.single_type();
/* Assumes all values in the buffer are uninitialized already. */
Stack<void *> &buffers = type.can_exist_in_buffer(small_value_max_size,
small_value_max_alignment) ?
small_span_buffers_free_list_ :
span_buffers_free_lists_.lookup_or_add_default(type.size());
buffers.push(value_typed->data);
Vector<void *> &buffers = type.can_exist_in_buffer(small_value_max_size,
small_value_max_alignment) ?
small_span_buffers_free_list_ :
span_buffers_free_lists_.lookup_or_add_default(
type.size());
buffers.append(value_typed->data);
}
break;
}
@ -275,10 +316,10 @@ class ValueAllocator : NonCopyable, NonMovable {
const bool is_small = type.can_exist_in_buffer(small_value_max_size,
small_value_max_alignment);
if (is_small) {
small_single_value_free_list_.push(value_typed->data);
small_single_value_free_list_.append(value_typed->data);
}
else {
single_value_free_lists_.lookup_or_add_default(&type).push(value_typed->data);
single_value_free_lists_.lookup_or_add_default(&type).append(value_typed->data);
}
break;
}
@ -289,20 +330,20 @@ class ValueAllocator : NonCopyable, NonMovable {
}
}
Stack<VariableValue *> &stack = variable_value_free_lists_[int(value->type)];
stack.push(value);
Vector<VariableValue *> &stack = variable_value_free_lists_[int(value->type)];
stack.append(value);
}
private:
template<typename T, typename... Args> T *obtain(Args &&...args)
{
static_assert(std::is_base_of_v<VariableValue, T>);
Stack<VariableValue *> &stack = variable_value_free_lists_[int(T::static_type)];
Vector<VariableValue *> &stack = variable_value_free_lists_[int(T::static_type)];
if (stack.is_empty()) {
void *buffer = linear_allocator_.allocate(sizeof(T), alignof(T));
void *buffer = local_allocator_.allocate(sizeof(T), alignof(T));
return new (buffer) T(std::forward<Args>(args)...);
}
return new (stack.pop()) T(std::forward<Args>(args)...);
return new (stack.pop_last()) T(std::forward<Args>(args)...);
}
};
@ -414,7 +455,7 @@ class VariableState : NonCopyable, NonMovable {
const CPPType &type = data_type.single_type();
VariableValue_Span *new_value = nullptr;
if (caller_provided_storage_ == nullptr) {
new_value = value_allocator.obtain_Span(type, array_size);
new_value = value_allocator.obtain_Span(type);
}
else {
/* Reuse the storage provided by the caller when possible. */
@ -445,7 +486,7 @@ class VariableState : NonCopyable, NonMovable {
const CPPType &type = data_type.vector_base_type();
VariableValue_GVectorArray *new_value = nullptr;
if (caller_provided_storage_ == nullptr) {
new_value = value_allocator.obtain_GVectorArray(type, array_size);
new_value = value_allocator.obtain_GVectorArray(type);
}
else {
new_value = value_allocator.obtain_GVectorArray_not_owned(
@ -829,10 +870,10 @@ class VariableStates {
IndexMask full_mask_;
public:
VariableStates(LinearAllocator<> &linear_allocator,
VariableStates(LocalAllocator &local_allocator,
const MFProcedure &procedure,
IndexMask full_mask)
: value_allocator_(linear_allocator),
: value_allocator_(local_allocator, full_mask.min_array_size()),
procedure_(procedure),
variable_states_(procedure.variables().size()),
full_mask_(full_mask)
@ -1178,11 +1219,8 @@ void MFProcedureExecutor::call(IndexMask full_mask, MFParams params, MFContext c
{
BLI_assert(procedure_.validate());
AlignedBuffer<512, 64> local_buffer;
LinearAllocator<> linear_allocator;
linear_allocator.provide_buffer(local_buffer);
VariableStates variable_states{linear_allocator, procedure_, full_mask};
LocalAllocator &local_allocator = context.allocator();
VariableStates variable_states{local_allocator, procedure_, full_mask};
variable_states.add_initial_variable_states(*this, procedure_, params);
InstructionScheduler scheduler;

View File

@ -304,6 +304,7 @@ class LazyFunctionForUndefinedNode : public LazyFunction {
* values. If any input is a field, the outputs will also be fields.
*/
static void execute_multi_function_on_value_or_field(
LocalAllocator &allocator,
const MultiFunction &fn,
const std::shared_ptr<MultiFunction> &owned_fn,
const Span<const ValueOrFieldCPPType *> input_types,
@ -354,7 +355,7 @@ static void execute_multi_function_on_value_or_field(
else {
/* In this case, the multi-function is evaluated directly. */
MFParamsBuilder params{fn, 1};
MFContextBuilder context;
MFContextBuilder context{&allocator};
for (const int i : input_types.index_range()) {
const ValueOrFieldCPPType &type = *input_types[i];
@ -412,7 +413,7 @@ class LazyFunctionForMutedNode : public LazyFunction {
}
}
void execute_impl(lf::Params &params, const lf::Context & /*context*/) const override
void execute_impl(lf::Params &params, const lf::Context &context) const override
{
for (const int output_i : outputs_.index_range()) {
if (params.output_was_set(output_i)) {
@ -446,8 +447,13 @@ class LazyFunctionForMutedNode : public LazyFunction {
if (conversions.is_convertible(from_type->value, to_type->value)) {
const MultiFunction &multi_fn = *conversions.get_conversion_multi_function(
MFDataType::ForSingle(from_type->value), MFDataType::ForSingle(to_type->value));
execute_multi_function_on_value_or_field(
multi_fn, {}, {from_type}, {to_type}, {input_value}, {output_value});
execute_multi_function_on_value_or_field(*context.allocator,
multi_fn,
{},
{from_type},
{to_type},
{input_value},
{output_value});
}
params.output_set(output_i);
continue;
@ -480,7 +486,7 @@ class LazyFunctionForMultiFunctionConversion : public LazyFunction {
outputs_.append({"To", to.self});
}
void execute_impl(lf::Params &params, const lf::Context & /*context*/) const override
void execute_impl(lf::Params &params, const lf::Context &context) const override
{
const void *from_value = params.try_get_input_data_ptr(0);
void *to_value = params.get_output_data_ptr(0);
@ -488,7 +494,7 @@ class LazyFunctionForMultiFunctionConversion : public LazyFunction {
BLI_assert(to_value != nullptr);
execute_multi_function_on_value_or_field(
fn_, {}, {&from_type_}, {&to_type_}, {from_value}, {to_value});
*context.allocator, fn_, {}, {&from_type_}, {&to_type_}, {from_value}, {to_value});
params.output_set(0);
}
@ -521,7 +527,7 @@ class LazyFunctionForMultiFunctionNode : public LazyFunction {
}
}
void execute_impl(lf::Params &params, const lf::Context & /*context*/) const override
void execute_impl(lf::Params &params, const lf::Context &context) const override
{
Vector<const void *> input_values(inputs_.size());
Vector<void *> output_values(outputs_.size());
@ -531,8 +537,13 @@ class LazyFunctionForMultiFunctionNode : public LazyFunction {
for (const int i : outputs_.index_range()) {
output_values[i] = params.get_output_data_ptr(i);
}
execute_multi_function_on_value_or_field(
*fn_item_.fn, fn_item_.owned_fn, input_types_, output_types_, input_values, output_values);
execute_multi_function_on_value_or_field(*context.allocator,
*fn_item_.fn,
fn_item_.owned_fn,
input_types_,
output_types_,
input_values,
output_values);
for (const int i : outputs_.index_range()) {
params.output_set(i);
}