WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

13 changed files with 548 additions and 134 deletions

View File

@ -0,0 +1,331 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <cstddef>
#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
#include "BLI_utility_mixins.hh"
#include "BLI_vector.hh"
// #define BLI_LOCAL_ALLOCATOR_USE_GUARDED
// #define BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
namespace blender {
class LocalAllocatorSet;
class LocalAllocator;
class LocalAllocatorPool;
class LocalAllocatorPool : NonCopyable, NonMovable {
private:
Stack<void *> buffers;
int64_t element_size = -1;
int64_t alignment = -1;
friend LocalAllocator;
};
class LocalAllocator : NonCopyable, NonMovable {
private:
static constexpr int64_t s_alignment = 64;
static constexpr int64_t s_global_allocation_threshold = 5 * 1024 * 1024;
LocalAllocatorSet &owner_set_;
AlignedBuffer<256, 64> initial_buffer_;
LinearAllocator<> linear_allocator_;
struct Head {
int64_t buffer_size;
int64_t buffer_alignment;
};
static_assert(is_power_of_2_constexpr(sizeof(Head)));
std::array<LocalAllocatorPool, 8> small_buffer_pools_;
Map<int, std::unique_ptr<LocalAllocatorPool>> large_buffer_pools_;
friend LocalAllocatorSet;
LocalAllocator(LocalAllocatorSet &owner_set);
public:
~LocalAllocator();
bool is_local() const;
LocalAllocator &local();
LocalAllocatorSet &owner_set();
void *allocate(int64_t size, int64_t alignment);
void deallocate(const void *buffer, int64_t size, int64_t alignment);
void *allocate(LocalAllocatorPool &pool);
void deallocate(const void *buffer, LocalAllocatorPool &pool);
void *allocate_with_head(int64_t size, int64_t alignment);
void deallocate_with_head(const void *buffer);
LocalAllocatorPool &get_pool(int64_t size, int64_t alignment);
template<typename T, typename... Args> T &allocate_new(Args &&...args);
template<typename T> void destruct_free(const T *value);
template<typename T> MutableSpan<T> allocate_array(int64_t size);
template<typename T, typename... Args>
MutableSpan<T> allocate_new_array(int64_t size, Args &&...args);
template<typename T> void destruct_free_array(Span<T> data);
template<typename T> void destruct_free_array(MutableSpan<T> data);
};
class LocalAllocatorSet : NonCopyable, NonMovable {
private:
threading::EnumerableThreadSpecific<LocalAllocator> allocator_by_thread_;
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
std::mutex debug_sizes_mutex_;
Map<const void *, int64_t> debug_sizes_;
#endif
friend LocalAllocator;
public:
LocalAllocatorSet();
~LocalAllocatorSet();
LocalAllocator &local();
};
class ThreadedLocalAllocatorRef {
private:
LocalAllocatorSet &allocator_set_;
public:
ThreadedLocalAllocatorRef(LocalAllocator &allocator) : allocator_set_(allocator.owner_set())
{
}
void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
{
LocalAllocator &allocator = allocator_set_.local();
return allocator.allocate_with_head(size, alignment);
}
void deallocate(void *ptr)
{
LocalAllocator &allocator = allocator_set_.local();
allocator.deallocate_with_head(ptr);
}
};
class LocalAllocatorRef {
private:
LocalAllocator &allocator_;
public:
LocalAllocatorRef(LocalAllocator &allocator) : allocator_(allocator)
{
}
void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
{
return allocator_.allocate_with_head(size, alignment);
}
void deallocate(void *ptr)
{
allocator_.deallocate_with_head(ptr);
}
};
inline bool LocalAllocator::is_local() const
{
return this == &owner_set_.local();
}
inline LocalAllocator &LocalAllocator::local()
{
return owner_set_.local();
}
inline LocalAllocatorSet &LocalAllocator::owner_set()
{
return owner_set_;
}
inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment)
{
LocalAllocatorPool &pool = this->get_pool(size, alignment);
BLI_assert(pool.element_size >= size);
BLI_assert(pool.alignment >= alignment);
return this->allocate(pool);
}
inline void LocalAllocator::deallocate(const void *buffer,
const int64_t size,
const int64_t alignment)
{
LocalAllocatorPool &pool = this->get_pool(size, alignment);
BLI_assert(pool.element_size >= size);
BLI_assert(pool.alignment >= alignment);
this->deallocate(buffer, pool);
}
inline void *LocalAllocator::allocate(LocalAllocatorPool &pool)
{
BLI_assert(this->is_local());
#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
return MEM_mallocN_aligned(pool.element_size, pool.alignment, __func__);
#endif
void *buffer;
if (!pool.buffers.is_empty()) {
buffer = pool.buffers.pop();
BLI_asan_unpoison(buffer, pool.element_size);
}
else if (pool.element_size < s_global_allocation_threshold) {
buffer = linear_allocator_.allocate(pool.element_size, pool.alignment);
}
else {
buffer = MEM_mallocN(pool.element_size, __func__);
}
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
{
std::lock_guard lock{owner_set_.debug_sizes_mutex_};
owner_set_.debug_sizes_.add_new(buffer, pool.element_size);
}
#endif
return buffer;
}
inline void LocalAllocator::deallocate(const void *buffer, LocalAllocatorPool &pool)
{
BLI_assert(this->is_local());
#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
MEM_freeN(const_cast<void *>(buffer));
return;
#endif
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
{
std::lock_guard lock{owner_set_.debug_sizes_mutex_};
const int64_t last_size = owner_set_.debug_sizes_.pop(buffer);
if (last_size != pool.element_size) {
BLI_assert_unreachable();
}
}
#endif
#ifdef DEBUG
memset(const_cast<void *>(buffer), -1, pool.element_size);
#endif
if (pool.element_size < s_global_allocation_threshold) {
BLI_asan_poison(buffer, pool.element_size);
pool.buffers.push(const_cast<void *>(buffer));
}
else {
MEM_freeN(const_cast<void *>(buffer));
}
}
inline LocalAllocatorPool &LocalAllocator::get_pool(const int64_t size, const int64_t alignment)
{
BLI_assert(size > 0);
BLI_assert(alignment <= size);
BLI_assert(alignment <= s_alignment);
BLI_assert(is_power_of_2_i(alignment));
UNUSED_VARS_NDEBUG(alignment);
BLI_assert(this->is_local());
if (size <= 64) {
return small_buffer_pools_[(size - 1) >> 3];
}
const int key = bitscan_reverse_uint64(uint64_t(size));
return *large_buffer_pools_.lookup_or_add_cb(key, [&]() {
auto pool = std::make_unique<LocalAllocatorPool>();
pool->element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
pool->alignment = s_alignment;
return pool;
});
}
inline void *LocalAllocator::allocate_with_head(int64_t size, int64_t alignment)
{
const int64_t buffer_size = size + std::max<int64_t>(alignment, sizeof(Head));
const int64_t buffer_alignment = std::max<int64_t>(alignment, alignof(Head));
void *buffer = this->allocate(buffer_size, buffer_alignment);
Head *head = new (buffer) Head;
head->buffer_size = buffer_size;
head->buffer_alignment = buffer_alignment;
return head + 1;
}
inline void LocalAllocator::deallocate_with_head(const void *buffer)
{
const Head *head = static_cast<const Head *>(buffer) - 1;
this->deallocate(head, head->buffer_size, head->buffer_alignment);
}
template<typename T, typename... Args> inline T &LocalAllocator::allocate_new(Args &&...args)
{
void *buffer = this->allocate(sizeof(T), alignof(T));
T *value = new (buffer) T(std::forward<Args>(args)...);
return *value;
}
template<typename T> inline void LocalAllocator::destruct_free(const T *value)
{
std::destroy_at(value);
this->deallocate(value, sizeof(T), alignof(T));
}
template<typename T> inline MutableSpan<T> LocalAllocator::allocate_array(const int64_t size)
{
if (size == 0) {
return {};
}
void *buffer = this->allocate(size * sizeof(T), alignof(T));
return {static_cast<T *>(buffer), size};
}
template<typename T, typename... Args>
inline MutableSpan<T> LocalAllocator::allocate_new_array(const int64_t size, Args &&...args)
{
MutableSpan<T> array = this->allocate_array<T>(size);
for (const int64_t i : IndexRange(size)) {
new (&array[i]) T(std::forward<Args>(args)...);
}
return array;
}
template<typename T> inline void LocalAllocator::destruct_free_array(Span<T> data)
{
if (data.is_empty()) {
return;
}
destruct_n(const_cast<T *>(data.data()), data.size());
this->deallocate(data.data(), data.size_in_bytes(), alignof(T));
}
template<typename T> inline void LocalAllocator::destruct_free_array(MutableSpan<T> data)
{
this->destruct_free_array(data.as_span());
}
inline LocalAllocator &LocalAllocatorSet::local()
{
return allocator_by_thread_.local();
}
} // namespace blender
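
Not part of the patch, but for review context: a minimal usage sketch of the API declared above. The calls and types are taken from this header; the surrounding function is made up.

#include "BLI_local_allocator.hh"
#include "BLI_stack.hh"

static void local_allocator_usage_sketch()
{
  /* One set per logical task; every thread pulls its own allocator from it. */
  blender::LocalAllocatorSet allocator_set;
  blender::LocalAllocator &allocator = allocator_set.local();

  /* Typed allocation from the pooled memory. */
  int &value = allocator.allocate_new<int>(42);
  allocator.destruct_free(&value);

  /* Raw allocation: size and alignment have to be passed again on deallocation,
   * because this variant stores no header. */
  void *buffer = allocator.allocate(128, 8);
  allocator.deallocate(buffer, 128, 8);

  /* Arrays of value-initialized elements. */
  blender::MutableSpan<float> values = allocator.allocate_new_array<float>(100);
  allocator.destruct_free_array(values);

  /* The *Ref adapters satisfy the allocator interface used by containers,
   * mirroring the Stack/Array usage in the graph executor changes below. */
  blender::Stack<int, 16, blender::LocalAllocatorRef> stack(allocator);
  stack.push(1);
}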

View File

@ -91,6 +91,7 @@ set(SRC
intern/lazy_threading.cc
intern/length_parameterize.cc
intern/listbase.cc
intern/local_allocator.cc
intern/math_base.c
intern/math_base_inline.c
intern/math_base_safe_inline.c
@ -256,6 +257,7 @@ set(SRC
BLI_linklist_stack.h
BLI_listbase.h
BLI_listbase_wrapper.hh
BLI_local_allocator.hh
BLI_map.hh
BLI_map_slots.hh
BLI_math.h
@ -484,6 +486,7 @@ if(WITH_GTESTS)
tests/BLI_linear_allocator_test.cc
tests/BLI_linklist_lockfree_test.cc
tests/BLI_listbase_test.cc
tests/BLI_local_allocator_test.cc
tests/BLI_map_test.cc
tests/BLI_math_base_safe_test.cc
tests/BLI_math_base_test.cc

View File

@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_local_allocator.hh"
namespace blender {
LocalAllocatorSet::LocalAllocatorSet()
: allocator_by_thread_([this]() { return LocalAllocator{*this}; })
{
}
LocalAllocatorSet::~LocalAllocatorSet() = default;
LocalAllocator::LocalAllocator(LocalAllocatorSet &owner_set) : owner_set_(owner_set)
{
linear_allocator_.provide_buffer(initial_buffer_);
for (const int64_t i : IndexRange(small_buffer_pools_.size())) {
LocalAllocatorPool &pool = small_buffer_pools_[i];
pool.element_size = 8 * (i + 1);
pool.alignment = power_of_2_min_u(pool.element_size);
}
}
LocalAllocator::~LocalAllocator() = default;
} // namespace blender

View File

@ -0,0 +1,10 @@
/* SPDX-License-Identifier: Apache-2.0 */
#include "BLI_local_allocator.hh"
#include "BLI_strict_flags.h"
#include "testing/testing.h"
namespace blender::tests {
} // namespace blender::tests
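
The test file is still an empty stub. A first test inside the namespace above could look roughly like this (a sketch only; the test name is made up, and the expectation relies on the pooling behavior implemented in BLI_local_allocator.hh):

TEST(local_allocator, ReuseSmallBuffer)
{
  LocalAllocatorSet allocator_set;
  LocalAllocator &allocator = allocator_set.local();

  /* A freed buffer of a small size class is pushed to a pool and handed out
   * again by the next allocation of the same size class. */
  void *first = allocator.allocate(32, 8);
  allocator.deallocate(first, 32, 8);
  void *second = allocator.allocate(32, 8);
  EXPECT_EQ(first, second);
  allocator.deallocate(second, 32, 8);
}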

View File

@ -42,6 +42,7 @@
#include "BLI_function_ref.hh"
#include "BLI_generic_pointer.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_local_allocator.hh"
#include "BLI_vector.hh"
#include <atomic>
@ -98,6 +99,8 @@ struct Context {
* Custom user data that can be used in the function.
*/
UserData *user_data;
LocalAllocator *allocator;
};
/**
@ -276,12 +279,12 @@ class LazyFunction {
* Allocates storage for this function. The storage will be passed to every call to #execute.
* If the function does not keep track of any state, this does not have to be implemented.
*/
virtual void *init_storage(LinearAllocator<> &allocator) const;
virtual void *init_storage(LocalAllocator &allocator) const;
/**
* Destruct the storage created in #init_storage.
*/
virtual void destruct_storage(void *storage) const;
virtual void destruct_storage(void *storage, LocalAllocator &allocator) const;
/**
* Calls `fn` with the input indices that the given `output_index` may depend on. By default

View File

@ -85,14 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
...);
output_usages.fill(ValueUsage::Used);
set_outputs.fill(false);
LinearAllocator<> allocator;
LocalAllocatorSet allocator_set;
LocalAllocator &allocator = allocator_set.local();
Context context;
context.user_data = user_data;
context.storage = fn.init_storage(allocator);
context.allocator = &allocator;
BasicParams params{
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
fn.execute(params, context);
fn.destruct_storage(context.storage);
fn.destruct_storage(context.storage, allocator);
/* Make sure all outputs have been computed. */
BLI_assert(!Span<bool>(set_outputs).contains(false));

View File

@ -59,11 +59,23 @@ class GraphExecutor : public LazyFunction {
using Logger = GraphExecutorLogger;
using SideEffectProvider = GraphExecutorSideEffectProvider;
struct NodeBufferOffsets {
int node;
int inputs;
int outputs;
};
struct PreprocessData {
Array<NodeBufferOffsets> offsets;
int node_state_buffer_size;
};
private:
/**
* The graph that is evaluated.
*/
const Graph &graph_;
const PreprocessData &preprocess_data_;
/**
* Input and output sockets of the entire graph.
*/
@ -85,11 +97,14 @@ class GraphExecutor : public LazyFunction {
GraphExecutor(const Graph &graph,
Span<const OutputSocket *> graph_inputs,
Span<const InputSocket *> graph_outputs,
const PreprocessData &preprocess_data,
const Logger *logger,
const SideEffectProvider *side_effect_provider);
void *init_storage(LinearAllocator<> &allocator) const override;
void destruct_storage(void *storage) const override;
void *init_storage(LocalAllocator &allocator) const override;
void destruct_storage(void *storage, LocalAllocator &allocator) const override;
static void preprocess(const Graph &graph, PreprocessData &r_preprocess_data);
private:
void execute_impl(Params &params, const Context &context) const override;

View File

@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
return outputs_[index].debug_name;
}
void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
void *LazyFunction::init_storage(LocalAllocator & /*allocator*/) const
{
return nullptr;
}
void LazyFunction::destruct_storage(void *storage) const
void LazyFunction::destruct_storage(void *storage, LocalAllocator & /*allocator*/) const
{
BLI_assert(storage == nullptr);
UNUSED_VARS_NDEBUG(storage);

View File

@ -75,7 +75,7 @@ enum class NodeScheduleState {
RunningAndRescheduled,
};
struct InputState {
struct alignas(8) InputState {
/**
* Value of this input socket. By default, the value is empty. When other nodes are done
* computing their outputs, the computed values will be forwarded to linked input sockets. The
@ -97,7 +97,7 @@ struct InputState {
bool was_ready_for_execution = false;
};
struct OutputState {
struct alignas(8) OutputState {
/**
* Keeps track of how the output value is used. If a connected input becomes used, this output
* has to become used as well. The output becomes unused when it is used by no input socket
@ -127,7 +127,7 @@ struct OutputState {
void *value = nullptr;
};
struct NodeState {
struct alignas(8) NodeState {
/**
* Needs to be locked when any data in this state is accessed that is not explicitly marked as
* not needing the lock.
@ -271,7 +271,7 @@ class Executor {
/**
* State of every node, indexed by #Node::index_in_graph.
*/
Array<NodeState *> node_states_;
MutableSpan<NodeState *> node_states_;
/**
* Parameters provided by the caller. This is always non-null, while a node is running.
*/
@ -285,15 +285,7 @@ class Executor {
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
std::thread::id current_main_thread_;
#endif
/**
* A separate linear allocator for every thread. We could potentially reuse some memory, but that
* doesn't seem worth it yet.
*/
struct ThreadLocalData {
LinearAllocator<> allocator;
};
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
LinearAllocator<> main_allocator_;
/**
* Set to false when the first execution ends.
*/
@ -308,18 +300,25 @@ class Executor {
BLI_assert(self_.graph_.node_indices_are_valid());
}
~Executor()
void destruct_state(LocalAllocator &allocator)
{
if (TaskPool *task_pool = task_pool_.load()) {
BLI_task_pool_free(task_pool);
}
threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) {
LocalAllocator &local_allocator = allocator.local();
for (const int node_index : range) {
const Node &node = *self_.graph_.nodes()[node_index];
NodeState &node_state = *node_states_[node_index];
this->destruct_node_state(node, node_state);
if (!node_state.node_has_finished) {
this->destruct_node_data(node, node_state, local_allocator);
}
std::destroy_at(&node_state);
}
});
allocator.deallocate(
node_states_[0], self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
allocator.destruct_free_array(node_states_);
}
/**
@ -364,7 +363,7 @@ class Executor {
}
}
this->initialize_static_value_usages(side_effect_nodes);
this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator());
this->schedule_side_effect_nodes(side_effect_nodes, current_task);
}
@ -382,54 +381,41 @@ class Executor {
void initialize_node_states()
{
Span<const Node *> nodes = self_.graph_.nodes();
node_states_.reinitialize(nodes.size());
node_states_ = context_->allocator->allocate_array<NodeState *>(nodes.size());
auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
for (const int i : range) {
const Node &node = *nodes[i];
NodeState &node_state = *allocator.construct<NodeState>().release();
node_states_[i] = &node_state;
this->construct_initial_node_state(allocator, node, node_state);
}
};
if (nodes.size() <= 256) {
construct_node_range(nodes.index_range(), main_allocator_);
}
else {
this->ensure_thread_locals();
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LinearAllocator<> &allocator = thread_locals_->local().allocator;
construct_node_range(range, allocator);
});
void *node_states_buffer = context_->allocator->allocate(
self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
for (const int i : nodes.index_range()) {
const Node &node = *nodes[i];
const GraphExecutor::NodeBufferOffsets &node_offsets = self_.preprocess_data_.offsets[i];
void *state_buffer = POINTER_OFFSET(node_states_buffer, node_offsets.node);
NodeState *node_state = new (state_buffer) NodeState();
node_state->inputs = {
static_cast<InputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.inputs)),
node.inputs().size()};
node_state->outputs = {
static_cast<OutputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.outputs)),
node.outputs().size()};
default_construct_n(node_state->inputs.data(), node_state->inputs.size());
default_construct_n(node_state->outputs.data(), node_state->outputs.size());
node_states_[i] = node_state;
}
}
void construct_initial_node_state(LinearAllocator<> &allocator,
const Node &node,
NodeState &node_state)
{
const Span<const InputSocket *> node_inputs = node.inputs();
const Span<const OutputSocket *> node_outputs = node.outputs();
node_state.inputs = allocator.construct_array<InputState>(node_inputs.size());
node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
}
void destruct_node_state(const Node &node, NodeState &node_state)
void destruct_node_data(const Node &node, NodeState &node_state, LocalAllocator &allocator)
{
if (node.is_function()) {
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
if (node_state.storage != nullptr) {
fn.destruct_storage(node_state.storage);
fn.destruct_storage(node_state.storage, allocator);
}
}
for (const int i : node.inputs().index_range()) {
InputState &input_state = node_state.inputs[i];
const InputSocket &input_socket = node.input(i);
this->destruct_input_value_if_exists(input_state, input_socket.type());
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
}
std::destroy_at(&node_state);
}
/**
@ -453,7 +439,7 @@ class Executor {
this->set_input_required(locked_node, socket);
}
else {
this->set_input_unused(locked_node, socket);
this->set_input_unused(locked_node, socket, this->get_local_allocator());
}
});
}
@ -500,13 +486,14 @@ class Executor {
* Most importantly, this function initializes `InputState.usage` and
* `OutputState.potential_target_sockets`.
*/
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes)
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes,
LocalAllocator &allocator)
{
const Span<const Node *> all_nodes = self_.graph_.nodes();
/* Used for a search through all nodes that outputs depend on. */
Stack<const Node *> reachable_nodes_to_check;
Array<bool> reachable_node_flags(all_nodes.size(), false);
Stack<const Node *, 16, LocalAllocatorRef> reachable_nodes_to_check{allocator};
Array<bool, 16, LocalAllocatorRef> reachable_node_flags{all_nodes.size(), false, allocator};
/* Graph outputs are always reachable. */
for (const InputSocket *socket : self_.graph_outputs_) {
@ -586,7 +573,7 @@ class Executor {
void forward_newly_provided_inputs(CurrentTask &current_task)
{
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
if (was_loaded.load()) {
@ -605,7 +592,7 @@ class Executor {
}
void forward_newly_provided_input(CurrentTask &current_task,
LinearAllocator<> &allocator,
LocalAllocator &allocator,
const int graph_input_index,
void *input_data)
{
@ -621,7 +608,6 @@ class Executor {
const Node &node = socket.node();
const int index_in_node = socket.index();
NodeState &node_state = *node_states_[node.index_in_graph()];
OutputState &output_state = node_state.outputs[index_in_node];
/* The notified output socket might be an input of the entire graph. In this case, notify the
* caller that the input is required. */
@ -640,12 +626,13 @@ class Executor {
return;
}
this->forward_newly_provided_input(
current_task, this->get_main_or_local_allocator(), graph_input_index, input_data);
current_task, this->get_local_allocator(), graph_input_index, input_data);
return;
}
BLI_assert(node.is_function());
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
OutputState &output_state = node_state.outputs[index_in_node];
if (output_state.usage == ValueUsage::Used) {
return;
}
@ -659,9 +646,9 @@ class Executor {
const Node &node = socket.node();
const int index_in_node = socket.index();
NodeState &node_state = *node_states_[node.index_in_graph()];
OutputState &output_state = node_state.outputs[index_in_node];
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
OutputState &output_state = node_state.outputs[index_in_node];
output_state.potential_target_sockets -= 1;
if (output_state.potential_target_sockets == 0) {
BLI_assert(output_state.usage != ValueUsage::Unused);
@ -760,7 +747,7 @@ class Executor {
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
{
NodeState &node_state = *node_states_[node.index_in_graph()];
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const LazyFunction &fn = node.function();
bool node_needs_execution = false;
@ -799,6 +786,7 @@ class Executor {
node_state.always_used_inputs_requested = true;
}
const bool allow_missing_requested_inputs = fn.allow_missing_requested_inputs();
for (const int input_index : node_state.inputs.index_range()) {
InputState &input_state = node_state.inputs[input_index];
if (input_state.was_ready_for_execution) {
@ -808,7 +796,11 @@ class Executor {
input_state.was_ready_for_execution = true;
continue;
}
if (!fn.allow_missing_requested_inputs()) {
const InputSocket &socket = node.input(input_index);
if (socket.origin() == nullptr) {
continue;
}
if (!allow_missing_requested_inputs) {
if (input_state.usage == ValueUsage::Used) {
return;
}
@ -848,7 +840,7 @@ class Executor {
/* Importantly, the node must not be locked when it is executed. That would result in locks
* being held for a long time in some cases, and in multiple locks being held by the same
* thread in the same graph, which can lead to deadlocks. */
this->execute_node(node, node_state, current_task);
this->execute_node(node, node_state, current_task, allocator);
}
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
@ -857,7 +849,7 @@ class Executor {
this->assert_expected_outputs_have_been_computed(locked_node);
}
#endif
this->finish_node_if_possible(locked_node);
this->finish_node_if_possible(locked_node, allocator);
const bool reschedule_requested = node_state.schedule_state ==
NodeScheduleState::RunningAndRescheduled;
node_state.schedule_state = NodeScheduleState::NotScheduled;
@ -895,7 +887,7 @@ class Executor {
}
}
void finish_node_if_possible(LockedNode &locked_node)
void finish_node_if_possible(LockedNode &locked_node, LocalAllocator &allocator)
{
const Node &node = locked_node.node;
NodeState &node_state = locked_node.node_state;
@ -923,44 +915,44 @@ class Executor {
const InputSocket &input_socket = node.input(input_index);
InputState &input_state = node_state.inputs[input_index];
if (input_state.usage == ValueUsage::Maybe) {
this->set_input_unused(locked_node, input_socket);
}
else if (input_state.usage == ValueUsage::Used) {
this->destruct_input_value_if_exists(input_state, input_socket.type());
this->set_input_unused(locked_node, input_socket, allocator);
}
}
if (node_state.storage != nullptr) {
if (node.is_function()) {
const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
fn_node.function().destruct_storage(node_state.storage);
}
node_state.storage = nullptr;
}
this->destruct_node_data(node, node_state, allocator);
}
void destruct_input_value_if_exists(InputState &input_state, const CPPType &type)
void destruct_input_value_if_exists(InputState &input_state,
const CPPType &type,
LocalAllocator &allocator)
{
if (input_state.value != nullptr) {
type.destruct(input_state.value);
allocator.deallocate(input_state.value, type.size(), type.alignment());
input_state.value = nullptr;
}
}
void execute_node(const FunctionNode &node, NodeState &node_state, CurrentTask &current_task);
void execute_node(const FunctionNode &node,
NodeState &node_state,
CurrentTask &current_task,
LocalAllocator &allocator);
void set_input_unused_during_execution(const Node &node,
NodeState &node_state,
const int input_index,
CurrentTask &current_task)
{
LocalAllocator &allocator = this->get_local_allocator();
const InputSocket &input_socket = node.input(input_index);
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
this->set_input_unused(locked_node, input_socket);
this->set_input_unused(locked_node, input_socket, allocator);
});
}
void set_input_unused(LockedNode &locked_node, const InputSocket &input_socket)
void set_input_unused(LockedNode &locked_node,
const InputSocket &input_socket,
LocalAllocator &allocator)
{
NodeState &node_state = locked_node.node_state;
const int input_index = input_socket.index();
@ -972,7 +964,7 @@ class Executor {
}
input_state.usage = ValueUsage::Unused;
this->destruct_input_value_if_exists(input_state, input_socket.type());
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
if (input_state.was_ready_for_execution) {
return;
}
@ -1026,7 +1018,7 @@ class Executor {
CurrentTask &current_task)
{
BLI_assert(value_to_forward.get() != nullptr);
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
LocalAllocator &allocator = this->get_local_allocator();
const CPPType &type = *value_to_forward.type();
if (self_.logger_ != nullptr) {
@ -1038,17 +1030,7 @@ class Executor {
const Node &target_node = target_socket->node();
NodeState &node_state = *node_states_[target_node.index_in_graph()];
const int input_index = target_socket->index();
InputState &input_state = node_state.inputs[input_index];
const bool is_last_target = target_socket == targets.last();
#ifdef DEBUG
if (input_state.value != nullptr) {
if (self_.logger_ != nullptr) {
self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
}
BLI_assert_unreachable();
}
#endif
BLI_assert(!input_state.was_ready_for_execution);
BLI_assert(target_socket->type() == type);
BLI_assert(target_socket->origin() == &from_socket);
@ -1072,6 +1054,18 @@ class Executor {
continue;
}
this->with_locked_node(target_node, node_state, current_task, [&](LockedNode &locked_node) {
InputState &input_state = node_state.inputs[input_index];
#ifdef DEBUG
if (input_state.value != nullptr) {
if (self_.logger_ != nullptr) {
self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
}
BLI_assert_unreachable();
}
#endif
BLI_assert(!input_state.was_ready_for_execution);
if (input_state.usage == ValueUsage::Unused) {
return;
}
@ -1089,6 +1083,7 @@ class Executor {
}
if (value_to_forward.get() != nullptr) {
value_to_forward.destruct();
allocator.deallocate(value_to_forward.get(), type.size(), type.alignment());
}
}
@ -1145,23 +1140,10 @@ class Executor {
if (BLI_system_thread_count() <= 1) {
return false;
}
this->ensure_thread_locals();
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
return true;
}
void ensure_thread_locals()
{
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
if (current_main_thread_ != std::this_thread::get_id()) {
BLI_assert_unreachable();
}
#endif
if (!thread_locals_) {
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
}
}
/**
* Allow other threads to steal all the nodes that are currently scheduled on this thread.
*/
@ -1194,12 +1176,12 @@ class Executor {
[](TaskPool * /*pool*/, void *data) { MEM_delete(static_cast<ScheduledNodes *>(data)); });
}
LinearAllocator<> &get_main_or_local_allocator()
LocalAllocator &get_local_allocator()
{
if (this->use_multi_threading()) {
return thread_locals_->local().allocator;
return context_->allocator->local();
}
return main_allocator_;
return *context_->allocator;
}
};
@ -1248,7 +1230,7 @@ class GraphExecutorLFParams final : public Params {
OutputState &output_state = node_state_.outputs[index];
BLI_assert(!output_state.has_been_computed);
if (output_state.value == nullptr) {
LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
LocalAllocator &allocator = executor_.get_local_allocator();
const CPPType &type = node_.output(index).type();
output_state.value = allocator.allocate(type.size(), type.alignment());
}
@ -1297,13 +1279,15 @@ class GraphExecutorLFParams final : public Params {
*/
inline void Executor::execute_node(const FunctionNode &node,
NodeState &node_state,
CurrentTask &current_task)
CurrentTask &current_task,
LocalAllocator &allocator)
{
const LazyFunction &fn = node.function();
GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task};
BLI_assert(context_ != nullptr);
Context fn_context = *context_;
fn_context.storage = node_state.storage;
fn_context.allocator = &allocator;
if (self_.logger_ != nullptr) {
self_.logger_->log_before_node_execute(node, node_params, fn_context);
@ -1330,12 +1314,32 @@ inline void Executor::execute_node(const FunctionNode &node,
}
}
void GraphExecutor::preprocess(const Graph &graph, PreprocessData &r_preprocess_data)
{
const Span<const Node *> nodes = graph.nodes();
r_preprocess_data.offsets.reinitialize(nodes.size());
int offset = 0;
for (const int i : nodes.index_range()) {
const Node &node = *nodes[i];
NodeBufferOffsets &node_offsets = r_preprocess_data.offsets[i];
node_offsets.node = offset;
offset += sizeof(NodeState);
node_offsets.inputs = offset;
offset += sizeof(InputState) * node.inputs().size();
node_offsets.outputs = offset;
offset += sizeof(OutputState) * node.outputs().size();
}
r_preprocess_data.node_state_buffer_size = offset;
}
GraphExecutor::GraphExecutor(const Graph &graph,
const Span<const OutputSocket *> graph_inputs,
const Span<const InputSocket *> graph_outputs,
const PreprocessData &preprocess_data,
const Logger *logger,
const SideEffectProvider *side_effect_provider)
: graph_(graph),
preprocess_data_(preprocess_data),
graph_inputs_(graph_inputs),
graph_outputs_(graph_outputs),
logger_(logger),
@ -1360,15 +1364,17 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
executor.execute(params, context);
}
void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
void *GraphExecutor::init_storage(LocalAllocator &allocator) const
{
Executor &executor = *allocator.construct<Executor>(*this).release();
Executor &executor = allocator.allocate_new<Executor>(*this);
return &executor;
}
void GraphExecutor::destruct_storage(void *storage) const
void GraphExecutor::destruct_storage(void *storage, LocalAllocator &allocator) const
{
std::destroy_at(static_cast<Executor *>(storage));
Executor *executor = static_cast<Executor *>(storage);
executor->destruct_state(allocator);
allocator.destruct_free(executor);
}
void GraphExecutorLogger::log_socket_value(const Socket &socket,

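The changed init_storage/destruct_storage contract above means storage is now allocated and freed through the caller-provided LocalAllocator. A function with its own storage would follow the same pattern as LazyFunctionForGroupNode further down; roughly (MyFunction and MyStorage are made-up names):

class MyFunction : public LazyFunction {
 private:
  struct MyStorage {
    int calls = 0;
  };

 public:
  void *init_storage(LocalAllocator &allocator) const override
  {
    return &allocator.allocate_new<MyStorage>();
  }

  void destruct_storage(void *storage, LocalAllocator &allocator) const override
  {
    allocator.destruct_free(static_cast<MyStorage *>(storage));
  }

  void execute_impl(Params &params, const Context &context) const override
  {
    MyStorage &storage = *static_cast<MyStorage *>(context.storage);
    storage.calls++;
    UNUSED_VARS(params);
  }
};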
View File

@ -105,7 +105,11 @@ TEST(lazy_function, SideEffects)
SimpleSideEffectProvider side_effect_provider{{&store_node}};
GraphExecutor executor_fn{graph, {&input_node.output(0)}, {}, nullptr, &side_effect_provider};
GraphExecutor::PreprocessData preprocess_data;
GraphExecutor::preprocess(graph, preprocess_data);
GraphExecutor executor_fn{
graph, {&input_node.output(0)}, {}, preprocess_data, nullptr, &side_effect_provider};
execute_lazy_function_eagerly(executor_fn, nullptr, std::make_tuple(5), std::make_tuple());
EXPECT_EQ(dst1, 15);
@ -167,8 +171,11 @@ TEST(lazy_function, GraphWithCycle)
graph.update_node_indices();
GraphExecutor::PreprocessData preprocess_data;
GraphExecutor::preprocess(graph, preprocess_data);
GraphExecutor executor_fn{
graph, {&input_node.output(0)}, {&output_node.input(0)}, nullptr, nullptr};
graph, {&input_node.output(0)}, {&output_node.input(0)}, preprocess_data, nullptr, nullptr};
int result = 0;
execute_lazy_function_eagerly(
executor_fn, nullptr, std::make_tuple(10), std::make_tuple(&result));

View File

@ -1146,8 +1146,12 @@ static GeometrySet compute_geometry(
blender::nodes::GeometryNodesLazyFunctionLogger lf_logger(lf_graph_info);
blender::nodes::GeometryNodesLazyFunctionSideEffectProvider lf_side_effect_provider;
lf::GraphExecutor graph_executor{
lf_graph_info.graph, graph_inputs, graph_outputs, &lf_logger, &lf_side_effect_provider};
lf::GraphExecutor graph_executor{lf_graph_info.graph,
graph_inputs,
graph_outputs,
lf_graph_info.graph_preprocess_data,
&lf_logger,
&lf_side_effect_provider};
blender::nodes::GeoNodesModifierData geo_nodes_modifier_data;
geo_nodes_modifier_data.depsgraph = ctx->depsgraph;
@ -1169,7 +1173,9 @@ static GeometrySet compute_geometry(
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
user_data.compute_context = &modifier_compute_context;
blender::LinearAllocator<> allocator;
blender::LocalAllocatorSet allocator_set;
blender::LocalAllocator &allocator = allocator_set.local();
Vector<GMutablePointer> inputs_to_destruct;
int input_index = -1;
@ -1212,6 +1218,7 @@ static GeometrySet compute_geometry(
lf::Context lf_context;
lf_context.storage = graph_executor.init_storage(allocator);
lf_context.user_data = &user_data;
lf_context.allocator = &allocator;
lf::BasicParams lf_params{graph_executor,
param_inputs,
param_outputs,
@ -1219,7 +1226,7 @@ static GeometrySet compute_geometry(
param_output_usages,
param_set_outputs};
graph_executor.execute(lf_params, lf_context);
graph_executor.destruct_storage(lf_context.storage);
graph_executor.destruct_storage(lf_context.storage, allocator);
for (GMutablePointer &ptr : inputs_to_destruct) {
ptr.destruct();
@ -1289,6 +1296,7 @@ static void modifyGeometry(ModifierData *md,
const ModifierEvalContext *ctx,
GeometrySet &geometry_set)
{
SCOPED_TIMER_AVERAGED(__func__);
NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
if (nmd->node_group == nullptr) {
return;

View File

@ -187,6 +187,7 @@ struct GeometryNodesLazyFunctionGraphInfo {
* Mappings between the lazy-function graph and the #bNodeTree.
*/
GeometryNodeLazyFunctionGraphMapping mapping;
lf::GraphExecutor::PreprocessData graph_preprocess_data;
/**
* Approximate number of nodes in the graph if all sub-graphs were inlined.
* This can be used as a simple heuristic for the complexity of the node group.

View File

@ -769,6 +769,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_.emplace(lf_graph_info.graph,
std::move(graph_inputs),
std::move(graph_outputs),
lf_graph_info.graph_preprocess_data,
&*lf_logger_,
&*lf_side_effect_provider_);
}
@ -805,18 +806,18 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_->execute(params, group_context);
}
void *init_storage(LinearAllocator<> &allocator) const override
void *init_storage(LocalAllocator &allocator) const override
{
Storage *s = allocator.construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(allocator);
return s;
Storage &s = allocator.allocate_new<Storage>();
s.graph_executor_storage = graph_executor_->init_storage(allocator);
return &s;
}
void destruct_storage(void *storage) const override
void destruct_storage(void *storage, LocalAllocator &allocator) const override
{
Storage *s = static_cast<Storage *>(storage);
graph_executor_->destruct_storage(s->graph_executor_storage);
std::destroy_at(s);
graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
allocator.destruct_free(s);
}
std::string name() const override
@ -1243,6 +1244,7 @@ struct GeometryNodesLazyFunctionGraphBuilder {
lf_graph_->update_node_indices();
lf_graph_info_->num_inline_nodes_approximate += lf_graph_->nodes().size();
lf::GraphExecutor::preprocess(*lf_graph_, lf_graph_info_->graph_preprocess_data);
}
private: