WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

10 changed files with 185 additions and 25 deletions
Showing only changes of commit 3d3c4216b3


@@ -0,0 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <algorithm>
#include <array>
#include <cstddef>
#include <memory>
#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"
namespace blender {
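/**
 * A #LocalPool is always created for a specific scope and keeps a reference to it, so the scope
 * has to outlive all pools that are bound to it.
 */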
class LocalPoolScope {
};
template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
private:
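/** Alignment of the chunks that are requested from the underlying allocator. */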
static constexpr int64_t s_alignment = 64;
const LocalPoolScope &pool_scope_;
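/** All chunks that were allocated from the underlying allocator; they are freed in the destructor. */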
Vector<MutableSpan<std::byte>> owned_buffers_;
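/** A stack of equally sized buffers that are currently not in use and can be handed out again. */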
struct BufferStack {
int64_t element_size = -1;
Stack<void *, 0> stack;
};
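/** Stacks for the small size classes 8, 16, ..., 64 bytes; index i serves sizes up to 8 * (i + 1). */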
std::array<BufferStack, 8> small_stacks_;
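/** Lazily created stacks for larger buffers, one for each power-of-two size class. */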
std::unique_ptr<Map<int, BufferStack>> large_stacks_;
BLI_NO_UNIQUE_ADDRESS Allocator allocator_;
public:
LocalPool(const LocalPoolScope &pool_scope) : pool_scope_(pool_scope)
{
for (const int64_t i : IndexRange(small_stacks_.size())) {
small_stacks_[i].element_size = 8 * (i + 1);
}
}
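/** Gives all memory back to the underlying allocator. Buffers from this pool must not be used anymore afterwards. */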
~LocalPool()
{
for (MutableSpan<std::byte> buffer : owned_buffers_) {
BLI_asan_unpoison(buffer.data(), buffer.size());
allocator_.deallocate(buffer.data());
}
}
void *allocate(const int64_t size, const int64_t alignment)
{
BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
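/* Reuse a previously deallocated buffer of the same size class if one is available. */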
if (!buffer_stack.stack.is_empty()) {
void *buffer = buffer_stack.stack.pop();
BLI_asan_unpoison(buffer, size);
return buffer;
}
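/* For small sizes, allocate a larger chunk and split it into equally sized elements so that
 * upcoming allocations in the same size class can be served without going through the
 * underlying allocator again. */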
if (size <= 4096) {
const int64_t allocation_size = std::clamp<int64_t>(
buffer_stack.element_size * 16, 512, 4096);
void *buffer = allocator_.allocate(allocation_size, s_alignment, __func__);
BLI_asan_poison(buffer, allocation_size);
const int64_t num = allocation_size / buffer_stack.element_size;
for (int64_t i = num - 1; i > 0; i--) {
buffer_stack.stack.push(POINTER_OFFSET(buffer, buffer_stack.element_size * i));
}
owned_buffers_.append({static_cast<std::byte *>(buffer), allocation_size});
BLI_asan_unpoison(buffer, size);
return buffer;
}
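/* Larger allocations are forwarded to the underlying allocator individually. */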
void *buffer = allocator_.allocate(
size_t(size), std::max<size_t>(s_alignment, size_t(alignment)), __func__);
owned_buffers_.append({static_cast<std::byte *>(buffer), size});
return buffer;
}
void deallocate(const void *buffer, const int64_t size, const int64_t alignment)
{
BLI_assert((size == 0 || alignment <= size) && alignment <= s_alignment);
#ifdef DEBUG
memset(const_cast<void *>(buffer), -1, size);
#endif
BLI_asan_poison(buffer, size);
BufferStack &buffer_stack = this->get_buffer_stack(size, alignment);
buffer_stack.stack.push(const_cast<void *>(buffer));
}
template<typename T, typename... Args> destruct_ptr<T> construct(Args &&...args)
{
void *buffer = this->allocate(sizeof(T), alignof(T));
T *value = new (buffer) T(std::forward<Args>(args)...);
return destruct_ptr<T>(value);
}
template<typename T> MutableSpan<T> allocate_array(int64_t size)
{
T *array = static_cast<T *>(this->allocate(sizeof(T) * size, alignof(T)));
return MutableSpan<T>(array, size);
}
template<typename T, typename... Args>
MutableSpan<T> construct_array(int64_t size, Args &&...args)
{
MutableSpan<T> array = this->allocate_array<T>(size);
for (const int64_t i : IndexRange(size)) {
new (&array[i]) T(std::forward<Args>(args)...);
}
return array;
}
private:
BufferStack &get_buffer_stack(const int64_t size, const int64_t /*alignment*/)
{
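/* Sizes of up to 64 bytes map to one of the eight small stacks; size 0 uses the first one. */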
if (size <= 64) {
return small_stacks_[(size - (size != 0)) >> 3];
}
if (!large_stacks_) {
large_stacks_ = std::make_unique<Map<int, BufferStack>>();
}
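/* Larger sizes get one stack per power-of-two size class; the key is derived from the highest
 * set bit of the requested size. */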
const int key = bitscan_reverse_uint64(uint64_t(size));
return large_stacks_->lookup_or_add_cb(key, [&]() {
BufferStack buffer_stack;
buffer_stack.element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
return buffer_stack;
});
}
};
} // namespace blender
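
For orientation (not part of the diff): a minimal usage sketch of the API above, following the same pattern as the new test and the compute_geometry() change further down. The function name is illustrative.

#include "BLI_local_pool.hh"

namespace blender {

static void local_pool_usage_example()
{
  /* A pool is always bound to a scope, and the scope has to outlive the pool. */
  LocalPoolScope scope;
  LocalPool<> pool(scope);

  /* Raw allocation; the alignment has to be <= 64 and <= the size. */
  void *buffer = pool.allocate(256, 16);
  pool.deallocate(buffer, 256, 16);

  /* Typed helpers. The #destruct_ptr only runs the destructor; the memory itself stays owned by
   * the pool and is given back to the underlying allocator when the pool is destructed. */
  destruct_ptr<Vector<int>> numbers = pool.construct<Vector<int>>();
  numbers->append(42);

  MutableSpan<int> values = pool.construct_array<int>(100, 0);
  values[0] = 1;
}

}  // namespace blender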


@@ -255,6 +255,7 @@ set(SRC
BLI_linklist_stack.h
BLI_listbase.h
BLI_listbase_wrapper.hh
BLI_local_pool.hh
BLI_map.hh
BLI_map_slots.hh
BLI_math.h
@@ -479,6 +480,7 @@ if(WITH_GTESTS)
tests/BLI_linear_allocator_test.cc
tests/BLI_linklist_lockfree_test.cc
tests/BLI_listbase_test.cc
tests/BLI_local_pool_test.cc
tests/BLI_map_test.cc
tests/BLI_math_base_safe_test.cc
tests/BLI_math_base_test.cc


@@ -0,0 +1,18 @@
/* SPDX-License-Identifier: Apache-2.0 */
#include "BLI_local_pool.hh"
#include "BLI_strict_flags.h"
#include "testing/testing.h"
namespace blender::tests {
TEST(local_pool, Test)
{
LocalPoolScope pool_scope;
LocalPool pool(pool_scope);
std::cout << pool.allocate(30000, 8) << "\n";
}
} // namespace blender::tests


@@ -42,6 +42,7 @@
#include "BLI_function_ref.hh"
#include "BLI_generic_pointer.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_local_pool.hh"
#include "BLI_vector.hh"
#include <atomic>
@@ -98,6 +99,8 @@ struct Context {
* Custom user data that can be used in the function.
*/
UserData *user_data;
LocalPool<> *local_pool = nullptr;
};
/**
@@ -276,7 +279,7 @@ class LazyFunction {
* Allocates storage for this function. The storage will be passed to every call to #execute.
* If the function does not keep track of any state, this does not have to be implemented.
*/
virtual void *init_storage(LinearAllocator<> &allocator) const;
virtual void *init_storage(LocalPool<> &allocator) const;
/**
* Destruct the storage created in #init_storage.
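
Since init_storage() now receives a LocalPool<> instead of a LinearAllocator<>, implementations allocate their storage from the pool and only run the destructor in destruct_storage(), as the GraphExecutor and LazyFunctionForGroupNode changes below do. A hypothetical minimal implementation (class and member names are illustrative, not part of the diff):

#include "FN_lazy_function.hh"

namespace blender::fn::lazy_function {

class ExampleFunction : public LazyFunction {
 private:
  struct Storage {
    int evaluation_count = 0;
  };

 public:
  void *init_storage(LocalPool<> &allocator) const override
  {
    /* The pool owns the memory and reclaims it when it is destructed. */
    return allocator.construct<Storage>().release();
  }

  void destruct_storage(void *storage) const override
  {
    /* Only the destructor has to be run; the memory stays with the pool. */
    static_cast<Storage *>(storage)->~Storage();
  }

  void execute_impl(Params & /*params*/, const Context &context) const override
  {
    Storage &storage = *static_cast<Storage *>(context.storage);
    storage.evaluation_count++;
  }
};

}  // namespace blender::fn::lazy_function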


@@ -85,7 +85,8 @@ inline void execute_lazy_function_eagerly_impl(
...);
output_usages.fill(ValueUsage::Used);
set_outputs.fill(false);
LinearAllocator<> allocator;
LocalPoolScope local_pool_scope;
LocalPool<> allocator(local_pool_scope);
Context context;
context.user_data = user_data;
context.storage = fn.init_storage(allocator);


@@ -88,7 +88,7 @@ class GraphExecutor : public LazyFunction {
const Logger *logger,
const SideEffectProvider *side_effect_provider);
void *init_storage(LinearAllocator<> &allocator) const override;
void *init_storage(LocalPool<> &allocator) const override;
void destruct_storage(void *storage) const override;
private:


@@ -25,7 +25,7 @@ std::string LazyFunction::output_name(int index) const
return outputs_[index].debug_name;
}
void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
void *LazyFunction::init_storage(LocalPool<> & /*allocator*/) const
{
return nullptr;
}


@@ -241,15 +241,16 @@ class Executor {
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
std::thread::id current_main_thread_;
#endif
/**
* A separate allocation pool for every thread, so that threads can allocate and reuse memory
* without synchronizing with each other.
*/
LocalPoolScope local_pool_scope_;
struct ThreadLocalData {
LinearAllocator<> allocator;
LocalPool<> local_pool;
ThreadLocalData(const LocalPoolScope &local_pool_scope) : local_pool(local_pool_scope)
{
}
};
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
LinearAllocator<> main_allocator_;
LocalPool<> main_allocator_;
/**
* Set to false when the first execution ends.
*/
@@ -258,7 +259,8 @@ class Executor {
friend GraphExecutorLFParams;
public:
Executor(const GraphExecutor &self) : self_(self), loaded_inputs_(self.graph_inputs_.size())
Executor(const GraphExecutor &self)
: self_(self), loaded_inputs_(self.graph_inputs_.size()), main_allocator_(local_pool_scope_)
{
/* The indices are necessary, because they are used as keys in #node_states_. */
BLI_assert(self_.graph_.node_indices_are_valid());
@@ -340,7 +342,7 @@ class Executor {
Span<const Node *> nodes = self_.graph_.nodes();
node_states_.reinitialize(nodes.size());
auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
auto construct_node_range = [&](const IndexRange range, LocalPool<> &allocator) {
for (const int i : range) {
const Node &node = *nodes[i];
NodeState &node_state = *allocator.construct<NodeState>().release();
@@ -355,13 +357,13 @@ class Executor {
this->ensure_thread_locals();
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
construct_node_range(range, allocator);
});
}
}
void construct_initial_node_state(LinearAllocator<> &allocator,
void construct_initial_node_state(LocalPool<> &allocator,
const Node &node,
NodeState &node_state)
{
@@ -533,7 +535,7 @@ class Executor {
void forward_newly_provided_inputs(CurrentTask &current_task)
{
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
if (was_loaded.load()) {
@@ -552,7 +554,7 @@ class Executor {
}
void forward_newly_provided_input(CurrentTask &current_task,
LinearAllocator<> &allocator,
LocalPool<> &allocator,
const int graph_input_index,
void *input_data)
{
@@ -706,7 +708,7 @@ class Executor {
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
{
NodeState &node_state = *node_states_[node.index_in_graph()];
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
const LazyFunction &fn = node.function();
bool node_needs_execution = false;
@@ -965,7 +967,7 @@ class Executor {
CurrentTask &current_task)
{
BLI_assert(value_to_forward.get() != nullptr);
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_main_or_local_allocator();
const CPPType &type = *value_to_forward.type();
if (self_.logger_ != nullptr) {
@@ -1091,7 +1093,8 @@ class Executor {
}
#endif
if (!thread_locals_) {
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>(
[scope = &local_pool_scope_]() { return ThreadLocalData{*scope}; });
}
}
@@ -1130,10 +1133,10 @@ class Executor {
});
}
LinearAllocator<> &get_main_or_local_allocator()
LocalPool<> &get_main_or_local_allocator()
{
if (this->use_multi_threading()) {
return thread_locals_->local().allocator;
return thread_locals_->local().local_pool;
}
return main_allocator_;
}
@@ -1184,7 +1187,7 @@ class GraphExecutorLFParams final : public Params {
OutputState &output_state = node_state_.outputs[index];
BLI_assert(!output_state.has_been_computed);
if (output_state.value == nullptr) {
LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
LocalPool<> &allocator = executor_.get_main_or_local_allocator();
const CPPType &type = node_.output(index).type();
output_state.value = allocator.allocate(type.size(), type.alignment());
}
@@ -1296,7 +1299,7 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
executor.execute(params, context);
}
void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
void *GraphExecutor::init_storage(LocalPool<> &allocator) const
{
Executor &executor = *allocator.construct<Executor>(*this).release();
return &executor;


@@ -1163,7 +1163,8 @@ static GeometrySet compute_geometry(
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
user_data.compute_context = &modifier_compute_context;
blender::LinearAllocator<> allocator;
blender::LocalPoolScope local_pool_scope;
blender::LocalPool<> allocator(local_pool_scope);
Vector<GMutablePointer> inputs_to_destruct;
int input_index;


@@ -689,7 +689,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_->execute(params, group_context);
}
void *init_storage(LinearAllocator<> &allocator) const override
void *init_storage(LocalPool<> &allocator) const override
{
Storage *s = allocator.construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(allocator);