WIP: Functions: new local allocator for better memory reuse and performance #104630
|
@ -0,0 +1,331 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
|
||||||
|
#include "BLI_allocator.hh"
|
||||||
|
#include "BLI_asan.h"
|
||||||
|
#include "BLI_enumerable_thread_specific.hh"
|
||||||
|
#include "BLI_linear_allocator.hh"
|
||||||
|
#include "BLI_map.hh"
|
||||||
|
#include "BLI_math_bits.h"
|
||||||
|
#include "BLI_stack.hh"
|
||||||
|
#include "BLI_utility_mixins.hh"
|
||||||
|
#include "BLI_vector.hh"
|
||||||
|
|
||||||
|
// #define BLI_LOCAL_ALLOCATOR_USE_GUARDED
|
||||||
|
// #define BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
|
||||||
|
|
||||||
|
namespace blender {
|
||||||
|
|
||||||
|
class LocalAllocatorSet;
|
||||||
|
class LocalAllocator;
|
||||||
|
class LocalAllocatorPool;
|
||||||
|
|
||||||
|
/**
 * A pool of buffers that all share the same size class. Freed buffers are cached here so that
 * subsequent allocations of the same size class can reuse them without touching the underlying
 * allocator. Only #LocalAllocator creates and uses pools.
 */
class LocalAllocatorPool : NonCopyable, NonMovable {
 private:
  /** Buffers that have been deallocated and are available for reuse. */
  Stack<void *> buffers;
  /** Size in bytes of every buffer served by this pool. -1 until initialized. */
  int64_t element_size = -1;
  /** Alignment guaranteed for every buffer served by this pool. -1 until initialized. */
  int64_t alignment = -1;

  friend LocalAllocator;
};
|
||||||
|
|
||||||
|
/**
 * A thread-local allocator that caches freed buffers in size-class pools for fast reuse.
 * Instances are created by and obtained from a #LocalAllocatorSet; an allocator may only be
 * used by the thread it belongs to (checked with #is_local in debug builds).
 */
class LocalAllocator : NonCopyable, NonMovable {
 private:
  /** Alignment of internal buffers and the upper bound for requested alignments. */
  static constexpr int64_t s_alignment = 64;
  /** Allocations of at least this size bypass the pooling linear allocator and use the global
   * allocator directly (and are returned to it eagerly on deallocation). */
  static constexpr int64_t s_global_allocation_threshold = 5 * 1024 * 1024;

  /** The set this per-thread allocator belongs to. */
  LocalAllocatorSet &owner_set_;
  /** Inline memory that serves the first small allocations without a heap allocation. */
  AlignedBuffer<256, 64> initial_buffer_;
  /** Backing allocator for pooled buffers below #s_global_allocation_threshold. */
  LinearAllocator<> linear_allocator_;

  /**
   * Prefix stored directly in front of allocations made with #allocate_with_head, so that
   * #deallocate_with_head can recover size and alignment from the pointer alone.
   */
  struct Head {
    int64_t buffer_size;
    int64_t buffer_alignment;
  };
  static_assert(is_power_of_2_constexpr(sizeof(Head)));

  /** Pools for sizes 1-64 bytes in 8-byte steps (see #get_pool for the mapping). */
  std::array<LocalAllocatorPool, 8> small_buffer_pools_;
  /** Pools for larger sizes, one per power-of-two size class, created on demand. */
  Map<int, std::unique_ptr<LocalAllocatorPool>> large_buffer_pools_;

  friend LocalAllocatorSet;

  /* Only #LocalAllocatorSet constructs allocators (one per thread). */
  LocalAllocator(LocalAllocatorSet &owner_set);

 public:
  ~LocalAllocator();

  /** True when this allocator is the one assigned to the current thread. */
  bool is_local() const;
  /** Get the allocator of the current thread (may be a different instance than `this`). */
  LocalAllocator &local();
  /** Get the set that owns this allocator. */
  LocalAllocatorSet &owner_set();

  /** Allocate/free a buffer. The same size/alignment must be passed to #deallocate later. */
  void *allocate(int64_t size, int64_t alignment);
  void deallocate(const void *buffer, int64_t size, int64_t alignment);

  /** Allocate/free directly from a pool previously obtained with #get_pool. */
  void *allocate(LocalAllocatorPool &pool);
  void deallocate(const void *buffer, LocalAllocatorPool &pool);

  /** Allocate/free a buffer that stores its own size/alignment in a hidden #Head prefix. */
  void *allocate_with_head(int64_t size, int64_t alignment);
  void deallocate_with_head(const void *buffer);

  /** Get the pool responsible for the given size class. */
  LocalAllocatorPool &get_pool(int64_t size, int64_t alignment);

  /** Typed convenience wrappers that construct/destruct values in allocator-owned memory. */
  template<typename T, typename... Args> T &allocate_new(Args &&...args);
  template<typename T, typename... Args> void destruct_free(const T *value);
  template<typename T> MutableSpan<T> allocate_array(int64_t size);
  template<typename T, typename... Args>
  MutableSpan<T> allocate_new_array(int64_t size, Args &&...args);
  template<typename T> void destruct_free_array(Span<T> data);
  template<typename T> void destruct_free_array(MutableSpan<T> data);
};
|
||||||
|
|
||||||
|
/**
 * Owns one #LocalAllocator per thread. Memory obtained from any of the allocators stays owned
 * by the set, so it must outlive all allocations made from it.
 */
class LocalAllocatorSet : NonCopyable, NonMovable {
 private:
  /** Lazily creates an allocator the first time a thread calls #local. */
  threading::EnumerableThreadSpecific<LocalAllocator> allocator_by_thread_;

#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
  /* Records the size registered for each live allocation across all threads, used to detect
   * mismatched deallocations. Guarded by the mutex because allocators of different threads
   * share this map. */
  std::mutex debug_sizes_mutex_;
  Map<const void *, int64_t> debug_sizes_;
#endif

  friend LocalAllocator;

 public:
  LocalAllocatorSet();
  ~LocalAllocatorSet();

  /** Get the allocator assigned to the current thread. */
  LocalAllocator &local();
};
|
||||||
|
|
||||||
|
class ThreadedLocalAllocatorRef {
|
||||||
|
private:
|
||||||
|
LocalAllocatorSet &allocator_set_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ThreadedLocalAllocatorRef(LocalAllocator &allocator) : allocator_set_(allocator.owner_set())
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
|
||||||
|
{
|
||||||
|
LocalAllocator &allocator = allocator_set_.local();
|
||||||
|
return allocator.allocate_with_head(size, alignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(void *ptr)
|
||||||
|
{
|
||||||
|
LocalAllocator &allocator = allocator_set_.local();
|
||||||
|
allocator.deallocate_with_head(ptr);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class LocalAllocatorRef {
|
||||||
|
private:
|
||||||
|
LocalAllocator &allocator_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
LocalAllocatorRef(LocalAllocator &allocator) : allocator_(allocator)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
|
||||||
|
{
|
||||||
|
return allocator_.allocate_with_head(size, alignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(void *ptr)
|
||||||
|
{
|
||||||
|
allocator_.deallocate_with_head(ptr);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
inline bool LocalAllocator::is_local() const
|
||||||
|
{
|
||||||
|
return this == &owner_set_.local();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Get the allocator assigned to the current thread, which may be a different instance.
 */
inline LocalAllocator &LocalAllocator::local()
{
  LocalAllocatorSet &set = owner_set_;
  return set.local();
}
|
||||||
|
|
||||||
|
/**
 * Get the set that owns this allocator and its per-thread siblings.
 */
inline LocalAllocatorSet &LocalAllocator::owner_set()
{
  return owner_set_;
}
|
||||||
|
|
||||||
|
inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment)
|
||||||
|
{
|
||||||
|
LocalAllocatorPool &pool = this->get_pool(size, alignment);
|
||||||
|
BLI_assert(pool.element_size >= size);
|
||||||
|
BLI_assert(pool.alignment >= alignment);
|
||||||
|
|
||||||
|
return this->allocate(pool);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void LocalAllocator::deallocate(const void *buffer,
|
||||||
|
const int64_t size,
|
||||||
|
const int64_t alignment)
|
||||||
|
{
|
||||||
|
LocalAllocatorPool &pool = this->get_pool(size, alignment);
|
||||||
|
BLI_assert(pool.element_size >= size);
|
||||||
|
BLI_assert(pool.alignment >= alignment);
|
||||||
|
|
||||||
|
this->deallocate(buffer, pool);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Get a buffer from the given pool. The pool must have been obtained from #get_pool on this
 * same allocator.
 */
inline void *LocalAllocator::allocate(LocalAllocatorPool &pool)
{
  /* Only the thread that owns this allocator may allocate from it. */
  BLI_assert(this->is_local());

#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
  /* Debug mode: bypass the pools entirely and use the guarded allocator. */
  return MEM_mallocN_aligned(pool.element_size, pool.alignment, __func__);
#endif

  void *buffer;
  if (!pool.buffers.is_empty()) {
    /* Reuse a previously freed buffer of the same size class. */
    buffer = pool.buffers.pop();
    /* The buffer was poisoned in #deallocate; make it accessible again. */
    BLI_asan_unpoison(buffer, pool.element_size);
  }
  else if (pool.element_size < s_global_allocation_threshold) {
    /* Small buffers are carved out of the thread-local linear allocator. */
    buffer = linear_allocator_.allocate(pool.element_size, pool.alignment);
  }
  else {
    /* Large buffers go directly to the global allocator so they can be returned to the system
     * when freed. Fix: use the aligned variant — plain `MEM_mallocN` does not honor
     * `pool.alignment` (up to 64 bytes here), so large allocations with alignment above the
     * malloc default could be under-aligned. `MEM_freeN` handles aligned allocations, so
     * #deallocate keeps working unchanged. */
    buffer = MEM_mallocN_aligned(pool.element_size, pool.alignment, __func__);
  }

#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
  {
    std::lock_guard lock{owner_set_.debug_sizes_mutex_};
    owner_set_.debug_sizes_.add_new(buffer, pool.element_size);
  }
#endif

  return buffer;
}
|
||||||
|
|
||||||
|
inline void LocalAllocator::deallocate(const void *buffer, LocalAllocatorPool &pool)
|
||||||
|
{
|
||||||
|
BLI_assert(this->is_local());
|
||||||
|
|
||||||
|
#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
|
||||||
|
MEM_freeN(const_cast<void *>(buffer));
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
|
||||||
|
{
|
||||||
|
std::lock_guard lock{owner_set_.debug_sizes_mutex_};
|
||||||
|
auto [last_size, last_alignment] = owner_set_.debug_sizes_.pop(buffer);
|
||||||
|
if (last_size != size) {
|
||||||
|
BLI_assert_unreachable();
|
||||||
|
}
|
||||||
|
if (last_alignment != alignment) {
|
||||||
|
BLI_assert_unreachable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
memset(const_cast<void *>(buffer), -1, pool.element_size);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (pool.element_size < s_global_allocation_threshold) {
|
||||||
|
BLI_asan_poison(buffer, pool.element_size);
|
||||||
|
pool.buffers.push(const_cast<void *>(buffer));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
MEM_freeN(const_cast<void *>(buffer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Get the pool that serves allocations of the given size class. The returned pool satisfies
 * `element_size >= size` and `alignment >= alignment` (asserted by the callers).
 */
inline LocalAllocatorPool &LocalAllocator::get_pool(const int64_t size, const int64_t alignment)
{
  BLI_assert(size > 0);
  /* Alignments larger than the size or larger than #s_alignment (64) are not supported. */
  BLI_assert(alignment <= size);
  BLI_assert(alignment <= s_alignment);
  BLI_assert(is_power_of_2_i(alignment));
  UNUSED_VARS_NDEBUG(alignment);

  BLI_assert(this->is_local());
  if (size <= 64) {
    /* Sizes 1..64 map to the eight small pools in 8-byte steps:
     * pool i serves sizes 8*i+1 .. 8*(i+1). */
    return small_buffer_pools_[(size - 1) >> 3];
  }
  /* Larger sizes share one pool per power-of-two size class. The key is presumably the number
   * of leading zero bits of the size — TODO confirm #bitscan_reverse_uint64 semantics. */
  const int key = bitscan_reverse_uint64(uint64_t(size));
  return *large_buffer_pools_.lookup_or_add_cb(key, [&]() {
    auto pool = std::make_unique<LocalAllocatorPool>();
    /* Smallest power of two able to hold any size in this class.
     * NOTE(review): for exact powers of two (e.g. 128) this rounds up to the next class (256),
     * and for sizes >= 2^63 the shift amount reaches 64, which is undefined behavior —
     * confirm whether such sizes can occur here. */
    pool->element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
    pool->alignment = s_alignment;
    return pool;
  });
}
|
||||||
|
|
||||||
|
/**
 * Allocate a buffer that remembers its own size and alignment in a #Head prefix, so that it can
 * later be freed with #deallocate_with_head without the caller passing them back.
 */
inline void *LocalAllocator::allocate_with_head(int64_t size, int64_t alignment)
{
  /* Reserve extra space in front of the user pointer for the #Head. */
  const int64_t buffer_size = size + std::max<int64_t>(alignment, sizeof(Head));
  const int64_t buffer_alignment = std::max<int64_t>(alignment, alignof(Head));
  void *buffer = this->allocate(buffer_size, buffer_alignment);
  Head *head = new (buffer) Head;
  head->buffer_size = buffer_size;
  head->buffer_alignment = buffer_alignment;
  /* NOTE(review): the returned pointer is `buffer + sizeof(Head)`. For `alignment >
   * sizeof(Head)` this is not necessarily aligned to `alignment`, even though the extra space
   * reserved above suggests an offset of `max(alignment, sizeof(Head))` was intended. Any fix
   * must be coordinated with #deallocate_with_head, which assumes the #Head sits at the start
   * of the underlying allocation. */
  return head + 1;
}
|
||||||
|
|
||||||
|
/**
 * Free a buffer that was allocated with #allocate_with_head. The size and alignment of the
 * underlying allocation are read from the #Head stored directly before the user pointer.
 */
inline void LocalAllocator::deallocate_with_head(const void *buffer)
{
  /* The #Head also marks the start of the underlying allocation (see #allocate_with_head). */
  const Head *head = static_cast<const Head *>(buffer) - 1;
  this->deallocate(head, head->buffer_size, head->buffer_alignment);
}
|
||||||
|
|
||||||
|
template<typename T, typename... Args> inline T &LocalAllocator::allocate_new(Args &&...args)
|
||||||
|
{
|
||||||
|
void *buffer = this->allocate(sizeof(T), alignof(T));
|
||||||
|
T *value = new (buffer) T(std::forward<Args>(args)...);
|
||||||
|
return *value;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T, typename... Args> inline void LocalAllocator::destruct_free(const T *value)
|
||||||
|
{
|
||||||
|
std::destroy_at(value);
|
||||||
|
this->deallocate(value, sizeof(T), alignof(T));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Allocate (but do not construct) an array of `size` elements. Returns an empty span when
 * `size` is zero.
 */
template<typename T> inline MutableSpan<T> LocalAllocator::allocate_array(const int64_t size)
{
  if (size == 0) {
    return {};
  }
  T *elements = static_cast<T *>(this->allocate(sizeof(T) * size, alignof(T)));
  return MutableSpan<T>(elements, size);
}
|
||||||
|
|
||||||
|
template<typename T, typename... Args>
|
||||||
|
MutableSpan<T> inline LocalAllocator::allocate_new_array(const int64_t size, Args &&...args)
|
||||||
|
{
|
||||||
|
MutableSpan<T> array = this->allocate_array<T>(size);
|
||||||
|
for (const int64_t i : IndexRange(size)) {
|
||||||
|
new (&array[i]) T(std::forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
return array;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Destruct all elements of the array and give its memory back to the allocator. Arrays created
 * with #allocate_array / #allocate_new_array must be freed through this. No-op for empty spans.
 */
template<typename T> inline void LocalAllocator::destruct_free_array(Span<T> data)
{
  if (data.is_empty()) {
    return;
  }
  T *elements = const_cast<T *>(data.data());
  destruct_n(elements, data.size());
  this->deallocate(elements, data.size_in_bytes(), alignof(T));
}
|
||||||
|
|
||||||
|
/**
 * Same as the #Span overload; provided so callers holding a #MutableSpan need not convert.
 */
template<typename T> inline void LocalAllocator::destruct_free_array(MutableSpan<T> data)
{
  const Span<T> const_data = data.as_span();
  this->destruct_free_array(const_data);
}
|
||||||
|
|
||||||
|
/**
 * Get the allocator belonging to the current thread, creating it on first access.
 */
inline LocalAllocator &LocalAllocatorSet::local()
{
  LocalAllocator &allocator = allocator_by_thread_.local();
  return allocator;
}
|
||||||
|
|
||||||
|
} // namespace blender
|
|
@ -91,6 +91,7 @@ set(SRC
|
||||||
intern/lazy_threading.cc
|
intern/lazy_threading.cc
|
||||||
intern/length_parameterize.cc
|
intern/length_parameterize.cc
|
||||||
intern/listbase.cc
|
intern/listbase.cc
|
||||||
|
intern/local_allocator.cc
|
||||||
intern/math_base.c
|
intern/math_base.c
|
||||||
intern/math_base_inline.c
|
intern/math_base_inline.c
|
||||||
intern/math_base_safe_inline.c
|
intern/math_base_safe_inline.c
|
||||||
|
@ -256,6 +257,7 @@ set(SRC
|
||||||
BLI_linklist_stack.h
|
BLI_linklist_stack.h
|
||||||
BLI_listbase.h
|
BLI_listbase.h
|
||||||
BLI_listbase_wrapper.hh
|
BLI_listbase_wrapper.hh
|
||||||
|
BLI_local_allocator.hh
|
||||||
BLI_map.hh
|
BLI_map.hh
|
||||||
BLI_map_slots.hh
|
BLI_map_slots.hh
|
||||||
BLI_math.h
|
BLI_math.h
|
||||||
|
@ -484,6 +486,7 @@ if(WITH_GTESTS)
|
||||||
tests/BLI_linear_allocator_test.cc
|
tests/BLI_linear_allocator_test.cc
|
||||||
tests/BLI_linklist_lockfree_test.cc
|
tests/BLI_linklist_lockfree_test.cc
|
||||||
tests/BLI_listbase_test.cc
|
tests/BLI_listbase_test.cc
|
||||||
|
tests/BLI_local_allocator_test.cc
|
||||||
tests/BLI_map_test.cc
|
tests/BLI_map_test.cc
|
||||||
tests/BLI_math_base_safe_test.cc
|
tests/BLI_math_base_safe_test.cc
|
||||||
tests/BLI_math_base_test.cc
|
tests/BLI_math_base_test.cc
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||||
|
|
||||||
|
#include "BLI_local_allocator.hh"
|
||||||
|
|
||||||
|
namespace blender {
|
||||||
|
|
||||||
|
/* Lazily create one #LocalAllocator for every thread that accesses the set. */
LocalAllocatorSet::LocalAllocatorSet()
    : allocator_by_thread_([this]() { return LocalAllocator{*this}; })
{
}
|
||||||
|
|
||||||
|
/* Destroys all thread-local allocators and with them all memory allocated from this set. */
LocalAllocatorSet::~LocalAllocatorSet() = default;
|
||||||
|
|
||||||
|
LocalAllocator::LocalAllocator(LocalAllocatorSet &owner_set) : owner_set_(owner_set)
{
  /* Serve the first small allocations from the inline buffer, avoiding an early heap
   * allocation. */
  linear_allocator_.provide_buffer(initial_buffer_);
  /* The small pools cover sizes up to 64 bytes in 8-byte steps: 8, 16, 24, ..., 64
   * (see #LocalAllocator::get_pool for the size -> pool mapping). */
  for (const int64_t i : IndexRange(small_buffer_pools_.size())) {
    LocalAllocatorPool &pool = small_buffer_pools_[i];
    pool.element_size = 8 * (i + 1);
    /* Presumably rounds down to a power of two — the strictest alignment any request mapped to
     * this pool can ask for, since #get_pool asserts alignment <= size. TODO confirm
     * #power_of_2_min_u semantics. */
    pool.alignment = power_of_2_min_u(pool.element_size);
  }
}
|
||||||
|
|
||||||
|
/* Pooled buffers below the threshold are owned by `linear_allocator_`, and larger buffers are
 * freed eagerly in #deallocate, so the defaulted member destructors release everything. */
LocalAllocator::~LocalAllocator() = default;
|
||||||
|
|
||||||
|
} // namespace blender
|
|
@ -0,0 +1,10 @@
|
||||||
|
/* SPDX-License-Identifier: Apache-2.0 */
|
||||||
|
|
||||||
|
#include "BLI_local_allocator.hh"
|
||||||
|
#include "BLI_strict_flags.h"
|
||||||
|
|
||||||
|
#include "testing/testing.h"
|
||||||
|
|
||||||
|
namespace blender::tests {
|
||||||
|
|
||||||
|
} // namespace blender::tests
|
|
@ -42,6 +42,7 @@
|
||||||
#include "BLI_function_ref.hh"
|
#include "BLI_function_ref.hh"
|
||||||
#include "BLI_generic_pointer.hh"
|
#include "BLI_generic_pointer.hh"
|
||||||
#include "BLI_linear_allocator.hh"
|
#include "BLI_linear_allocator.hh"
|
||||||
|
#include "BLI_local_allocator.hh"
|
||||||
#include "BLI_vector.hh"
|
#include "BLI_vector.hh"
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
@ -98,6 +99,8 @@ struct Context {
|
||||||
* Custom user data that can be used in the function.
|
* Custom user data that can be used in the function.
|
||||||
*/
|
*/
|
||||||
UserData *user_data;
|
UserData *user_data;
|
||||||
|
|
||||||
|
LocalAllocator *allocator;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -276,12 +279,12 @@ class LazyFunction {
|
||||||
* Allocates storage for this function. The storage will be passed to every call to #execute.
|
* Allocates storage for this function. The storage will be passed to every call to #execute.
|
||||||
* If the function does not keep track of any state, this does not have to be implemented.
|
* If the function does not keep track of any state, this does not have to be implemented.
|
||||||
*/
|
*/
|
||||||
virtual void *init_storage(LinearAllocator<> &allocator) const;
|
virtual void *init_storage(LocalAllocator &allocator) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Destruct the storage created in #init_storage.
|
* Destruct the storage created in #init_storage.
|
||||||
*/
|
*/
|
||||||
virtual void destruct_storage(void *storage) const;
|
virtual void destruct_storage(void *storage, LocalAllocator &allocator) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls `fn` with the input indices that the given `output_index` may depend on. By default
|
* Calls `fn` with the input indices that the given `output_index` may depend on. By default
|
||||||
|
|
|
@ -85,14 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
|
||||||
...);
|
...);
|
||||||
output_usages.fill(ValueUsage::Used);
|
output_usages.fill(ValueUsage::Used);
|
||||||
set_outputs.fill(false);
|
set_outputs.fill(false);
|
||||||
LinearAllocator<> allocator;
|
LocalAllocatorSet allocator_set;
|
||||||
|
LocalAllocator &allocator = allocator_set.local();
|
||||||
Context context;
|
Context context;
|
||||||
context.user_data = user_data;
|
context.user_data = user_data;
|
||||||
context.storage = fn.init_storage(allocator);
|
context.storage = fn.init_storage(allocator);
|
||||||
|
context.allocator = &allocator;
|
||||||
BasicParams params{
|
BasicParams params{
|
||||||
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
|
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
|
||||||
fn.execute(params, context);
|
fn.execute(params, context);
|
||||||
fn.destruct_storage(context.storage);
|
fn.destruct_storage(context.storage, allocator);
|
||||||
|
|
||||||
/* Make sure all outputs have been computed. */
|
/* Make sure all outputs have been computed. */
|
||||||
BLI_assert(!Span<bool>(set_outputs).contains(false));
|
BLI_assert(!Span<bool>(set_outputs).contains(false));
|
||||||
|
|
|
@ -59,11 +59,23 @@ class GraphExecutor : public LazyFunction {
|
||||||
using Logger = GraphExecutorLogger;
|
using Logger = GraphExecutorLogger;
|
||||||
using SideEffectProvider = GraphExecutorSideEffectProvider;
|
using SideEffectProvider = GraphExecutorSideEffectProvider;
|
||||||
|
|
||||||
|
struct NodeBufferOffsets {
|
||||||
|
int node;
|
||||||
|
int inputs;
|
||||||
|
int outputs;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PreprocessData {
|
||||||
|
Array<NodeBufferOffsets> offsets;
|
||||||
|
int node_state_buffer_size;
|
||||||
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/**
|
/**
|
||||||
* The graph that is evaluated.
|
* The graph that is evaluated.
|
||||||
*/
|
*/
|
||||||
const Graph &graph_;
|
const Graph &graph_;
|
||||||
|
const PreprocessData &preprocess_data_;
|
||||||
/**
|
/**
|
||||||
* Input and output sockets of the entire graph.
|
* Input and output sockets of the entire graph.
|
||||||
*/
|
*/
|
||||||
|
@ -85,11 +97,14 @@ class GraphExecutor : public LazyFunction {
|
||||||
GraphExecutor(const Graph &graph,
|
GraphExecutor(const Graph &graph,
|
||||||
Span<const OutputSocket *> graph_inputs,
|
Span<const OutputSocket *> graph_inputs,
|
||||||
Span<const InputSocket *> graph_outputs,
|
Span<const InputSocket *> graph_outputs,
|
||||||
|
const PreprocessData &preprocess_data,
|
||||||
const Logger *logger,
|
const Logger *logger,
|
||||||
const SideEffectProvider *side_effect_provider);
|
const SideEffectProvider *side_effect_provider);
|
||||||
|
|
||||||
void *init_storage(LinearAllocator<> &allocator) const override;
|
void *init_storage(LocalAllocator &allocator) const override;
|
||||||
void destruct_storage(void *storage) const override;
|
void destruct_storage(void *storage, LocalAllocator &allocator) const override;
|
||||||
|
|
||||||
|
static void preprocess(const Graph &graph, PreprocessData &r_preprocess_data);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void execute_impl(Params ¶ms, const Context &context) const override;
|
void execute_impl(Params ¶ms, const Context &context) const override;
|
||||||
|
|
|
@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
|
||||||
return outputs_[index].debug_name;
|
return outputs_[index].debug_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
|
void *LazyFunction::init_storage(LocalAllocator & /*allocator*/) const
|
||||||
{
|
{
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void LazyFunction::destruct_storage(void *storage) const
|
void LazyFunction::destruct_storage(void *storage, LocalAllocator & /*allocator*/) const
|
||||||
{
|
{
|
||||||
BLI_assert(storage == nullptr);
|
BLI_assert(storage == nullptr);
|
||||||
UNUSED_VARS_NDEBUG(storage);
|
UNUSED_VARS_NDEBUG(storage);
|
||||||
|
|
|
@ -75,7 +75,7 @@ enum class NodeScheduleState {
|
||||||
RunningAndRescheduled,
|
RunningAndRescheduled,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct InputState {
|
struct alignas(8) InputState {
|
||||||
/**
|
/**
|
||||||
* Value of this input socket. By default, the value is empty. When other nodes are done
|
* Value of this input socket. By default, the value is empty. When other nodes are done
|
||||||
* computing their outputs, the computed values will be forwarded to linked input sockets. The
|
* computing their outputs, the computed values will be forwarded to linked input sockets. The
|
||||||
|
@ -97,7 +97,7 @@ struct InputState {
|
||||||
bool was_ready_for_execution = false;
|
bool was_ready_for_execution = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct OutputState {
|
struct alignas(8) OutputState {
|
||||||
/**
|
/**
|
||||||
* Keeps track of how the output value is used. If a connected input becomes used, this output
|
* Keeps track of how the output value is used. If a connected input becomes used, this output
|
||||||
* has to become used as well. The output becomes unused when it is used by no input socket
|
* has to become used as well. The output becomes unused when it is used by no input socket
|
||||||
|
@ -127,7 +127,7 @@ struct OutputState {
|
||||||
void *value = nullptr;
|
void *value = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct NodeState {
|
struct alignas(8) NodeState {
|
||||||
/**
|
/**
|
||||||
* Needs to be locked when any data in this state is accessed that is not explicitly marked as
|
* Needs to be locked when any data in this state is accessed that is not explicitly marked as
|
||||||
* not needing the lock.
|
* not needing the lock.
|
||||||
|
@ -271,7 +271,7 @@ class Executor {
|
||||||
/**
|
/**
|
||||||
* State of every node, indexed by #Node::index_in_graph.
|
* State of every node, indexed by #Node::index_in_graph.
|
||||||
*/
|
*/
|
||||||
Array<NodeState *> node_states_;
|
MutableSpan<NodeState *> node_states_;
|
||||||
/**
|
/**
|
||||||
* Parameters provided by the caller. This is always non-null, while a node is running.
|
* Parameters provided by the caller. This is always non-null, while a node is running.
|
||||||
*/
|
*/
|
||||||
|
@ -285,15 +285,7 @@ class Executor {
|
||||||
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
|
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
|
||||||
std::thread::id current_main_thread_;
|
std::thread::id current_main_thread_;
|
||||||
#endif
|
#endif
|
||||||
/**
|
|
||||||
* A separate linear allocator for every thread. We could potentially reuse some memory, but that
|
|
||||||
* doesn't seem worth it yet.
|
|
||||||
*/
|
|
||||||
struct ThreadLocalData {
|
|
||||||
LinearAllocator<> allocator;
|
|
||||||
};
|
|
||||||
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
|
|
||||||
LinearAllocator<> main_allocator_;
|
|
||||||
/**
|
/**
|
||||||
* Set to false when the first execution ends.
|
* Set to false when the first execution ends.
|
||||||
*/
|
*/
|
||||||
|
@ -308,18 +300,25 @@ class Executor {
|
||||||
BLI_assert(self_.graph_.node_indices_are_valid());
|
BLI_assert(self_.graph_.node_indices_are_valid());
|
||||||
}
|
}
|
||||||
|
|
||||||
~Executor()
|
void destruct_state(LocalAllocator &allocator)
|
||||||
{
|
{
|
||||||
if (TaskPool *task_pool = task_pool_.load()) {
|
if (TaskPool *task_pool = task_pool_.load()) {
|
||||||
BLI_task_pool_free(task_pool);
|
BLI_task_pool_free(task_pool);
|
||||||
}
|
}
|
||||||
threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) {
|
threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) {
|
||||||
|
LocalAllocator &local_allocator = allocator.local();
|
||||||
for (const int node_index : range) {
|
for (const int node_index : range) {
|
||||||
const Node &node = *self_.graph_.nodes()[node_index];
|
const Node &node = *self_.graph_.nodes()[node_index];
|
||||||
NodeState &node_state = *node_states_[node_index];
|
NodeState &node_state = *node_states_[node_index];
|
||||||
this->destruct_node_state(node, node_state);
|
if (!node_state.node_has_finished) {
|
||||||
|
this->destruct_node_data(node, node_state, local_allocator);
|
||||||
|
}
|
||||||
|
std::destroy_at(&node_state);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
allocator.deallocate(
|
||||||
|
node_states_[0], self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
|
||||||
|
allocator.destruct_free_array(node_states_);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -364,7 +363,7 @@ class Executor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
this->initialize_static_value_usages(side_effect_nodes);
|
this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator());
|
||||||
this->schedule_side_effect_nodes(side_effect_nodes, current_task);
|
this->schedule_side_effect_nodes(side_effect_nodes, current_task);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -382,54 +381,41 @@ class Executor {
|
||||||
void initialize_node_states()
|
void initialize_node_states()
|
||||||
{
|
{
|
||||||
Span<const Node *> nodes = self_.graph_.nodes();
|
Span<const Node *> nodes = self_.graph_.nodes();
|
||||||
node_states_.reinitialize(nodes.size());
|
node_states_ = context_->allocator->allocate_array<NodeState *>(nodes.size());
|
||||||
|
|
||||||
auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
|
void *node_states_buffer = context_->allocator->allocate(
|
||||||
for (const int i : range) {
|
self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
|
||||||
const Node &node = *nodes[i];
|
|
||||||
NodeState &node_state = *allocator.construct<NodeState>().release();
|
for (const int i : nodes.index_range()) {
|
||||||
node_states_[i] = &node_state;
|
const Node &node = *nodes[i];
|
||||||
this->construct_initial_node_state(allocator, node, node_state);
|
const GraphExecutor::NodeBufferOffsets &node_offsets = self_.preprocess_data_.offsets[i];
|
||||||
}
|
void *state_buffer = POINTER_OFFSET(node_states_buffer, node_offsets.node);
|
||||||
};
|
NodeState *node_state = new (state_buffer) NodeState();
|
||||||
if (nodes.size() <= 256) {
|
node_state->inputs = {
|
||||||
construct_node_range(nodes.index_range(), main_allocator_);
|
static_cast<InputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.inputs)),
|
||||||
}
|
node.inputs().size()};
|
||||||
else {
|
node_state->outputs = {
|
||||||
this->ensure_thread_locals();
|
static_cast<OutputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.outputs)),
|
||||||
/* Construct all node states in parallel. */
|
node.outputs().size()};
|
||||||
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
|
default_construct_n(node_state->inputs.data(), node_state->inputs.size());
|
||||||
LinearAllocator<> &allocator = thread_locals_->local().allocator;
|
default_construct_n(node_state->outputs.data(), node_state->outputs.size());
|
||||||
construct_node_range(range, allocator);
|
node_states_[i] = node_state;
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void construct_initial_node_state(LinearAllocator<> &allocator,
|
void destruct_node_data(const Node &node, NodeState &node_state, LocalAllocator &allocator)
|
||||||
const Node &node,
|
|
||||||
NodeState &node_state)
|
|
||||||
{
|
|
||||||
const Span<const InputSocket *> node_inputs = node.inputs();
|
|
||||||
const Span<const OutputSocket *> node_outputs = node.outputs();
|
|
||||||
|
|
||||||
node_state.inputs = allocator.construct_array<InputState>(node_inputs.size());
|
|
||||||
node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
void destruct_node_state(const Node &node, NodeState &node_state)
|
|
||||||
{
|
{
|
||||||
if (node.is_function()) {
|
if (node.is_function()) {
|
||||||
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
|
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
|
||||||
if (node_state.storage != nullptr) {
|
if (node_state.storage != nullptr) {
|
||||||
fn.destruct_storage(node_state.storage);
|
fn.destruct_storage(node_state.storage, allocator);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (const int i : node.inputs().index_range()) {
|
for (const int i : node.inputs().index_range()) {
|
||||||
InputState &input_state = node_state.inputs[i];
|
InputState &input_state = node_state.inputs[i];
|
||||||
const InputSocket &input_socket = node.input(i);
|
const InputSocket &input_socket = node.input(i);
|
||||||
this->destruct_input_value_if_exists(input_state, input_socket.type());
|
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
|
||||||
}
|
}
|
||||||
std::destroy_at(&node_state);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -453,7 +439,7 @@ class Executor {
|
||||||
this->set_input_required(locked_node, socket);
|
this->set_input_required(locked_node, socket);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
this->set_input_unused(locked_node, socket);
|
this->set_input_unused(locked_node, socket, this->get_local_allocator());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -500,13 +486,14 @@ class Executor {
|
||||||
* Most importantly, this function initializes `InputState.usage` and
|
* Most importantly, this function initializes `InputState.usage` and
|
||||||
* `OutputState.potential_target_sockets`.
|
* `OutputState.potential_target_sockets`.
|
||||||
*/
|
*/
|
||||||
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes)
|
void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes,
|
||||||
|
LocalAllocator &allocator)
|
||||||
{
|
{
|
||||||
const Span<const Node *> all_nodes = self_.graph_.nodes();
|
const Span<const Node *> all_nodes = self_.graph_.nodes();
|
||||||
|
|
||||||
/* Used for a search through all nodes that outputs depend on. */
|
/* Used for a search through all nodes that outputs depend on. */
|
||||||
Stack<const Node *> reachable_nodes_to_check;
|
Stack<const Node *, 16, LocalAllocatorRef> reachable_nodes_to_check{allocator};
|
||||||
Array<bool> reachable_node_flags(all_nodes.size(), false);
|
Array<bool, 16, LocalAllocatorRef> reachable_node_flags{all_nodes.size(), false, allocator};
|
||||||
|
|
||||||
/* Graph outputs are always reachable. */
|
/* Graph outputs are always reachable. */
|
||||||
for (const InputSocket *socket : self_.graph_outputs_) {
|
for (const InputSocket *socket : self_.graph_outputs_) {
|
||||||
|
@ -586,7 +573,7 @@ class Executor {
|
||||||
|
|
||||||
void forward_newly_provided_inputs(CurrentTask ¤t_task)
|
void forward_newly_provided_inputs(CurrentTask ¤t_task)
|
||||||
{
|
{
|
||||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
LocalAllocator &allocator = this->get_local_allocator();
|
||||||
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
|
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
|
||||||
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
|
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
|
||||||
if (was_loaded.load()) {
|
if (was_loaded.load()) {
|
||||||
|
@ -605,7 +592,7 @@ class Executor {
|
||||||
}
|
}
|
||||||
|
|
||||||
void forward_newly_provided_input(CurrentTask ¤t_task,
|
void forward_newly_provided_input(CurrentTask ¤t_task,
|
||||||
LinearAllocator<> &allocator,
|
LocalAllocator &allocator,
|
||||||
const int graph_input_index,
|
const int graph_input_index,
|
||||||
void *input_data)
|
void *input_data)
|
||||||
{
|
{
|
||||||
|
@ -621,7 +608,6 @@ class Executor {
|
||||||
const Node &node = socket.node();
|
const Node &node = socket.node();
|
||||||
const int index_in_node = socket.index();
|
const int index_in_node = socket.index();
|
||||||
NodeState &node_state = *node_states_[node.index_in_graph()];
|
NodeState &node_state = *node_states_[node.index_in_graph()];
|
||||||
OutputState &output_state = node_state.outputs[index_in_node];
|
|
||||||
|
|
||||||
/* The notified output socket might be an input of the entire graph. In this case, notify the
|
/* The notified output socket might be an input of the entire graph. In this case, notify the
|
||||||
* caller that the input is required. */
|
* caller that the input is required. */
|
||||||
|
@ -640,12 +626,13 @@ class Executor {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
this->forward_newly_provided_input(
|
this->forward_newly_provided_input(
|
||||||
current_task, this->get_main_or_local_allocator(), graph_input_index, input_data);
|
current_task, this->get_local_allocator(), graph_input_index, input_data);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
BLI_assert(node.is_function());
|
BLI_assert(node.is_function());
|
||||||
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
||||||
|
OutputState &output_state = node_state.outputs[index_in_node];
|
||||||
if (output_state.usage == ValueUsage::Used) {
|
if (output_state.usage == ValueUsage::Used) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -659,9 +646,9 @@ class Executor {
|
||||||
const Node &node = socket.node();
|
const Node &node = socket.node();
|
||||||
const int index_in_node = socket.index();
|
const int index_in_node = socket.index();
|
||||||
NodeState &node_state = *node_states_[node.index_in_graph()];
|
NodeState &node_state = *node_states_[node.index_in_graph()];
|
||||||
OutputState &output_state = node_state.outputs[index_in_node];
|
|
||||||
|
|
||||||
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
||||||
|
OutputState &output_state = node_state.outputs[index_in_node];
|
||||||
output_state.potential_target_sockets -= 1;
|
output_state.potential_target_sockets -= 1;
|
||||||
if (output_state.potential_target_sockets == 0) {
|
if (output_state.potential_target_sockets == 0) {
|
||||||
BLI_assert(output_state.usage != ValueUsage::Unused);
|
BLI_assert(output_state.usage != ValueUsage::Unused);
|
||||||
|
@ -760,7 +747,7 @@ class Executor {
|
||||||
void run_node_task(const FunctionNode &node, CurrentTask ¤t_task)
|
void run_node_task(const FunctionNode &node, CurrentTask ¤t_task)
|
||||||
{
|
{
|
||||||
NodeState &node_state = *node_states_[node.index_in_graph()];
|
NodeState &node_state = *node_states_[node.index_in_graph()];
|
||||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
LocalAllocator &allocator = this->get_local_allocator();
|
||||||
const LazyFunction &fn = node.function();
|
const LazyFunction &fn = node.function();
|
||||||
|
|
||||||
bool node_needs_execution = false;
|
bool node_needs_execution = false;
|
||||||
|
@ -799,6 +786,7 @@ class Executor {
|
||||||
node_state.always_used_inputs_requested = true;
|
node_state.always_used_inputs_requested = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const bool allow_missing_requested_inputs = fn.allow_missing_requested_inputs();
|
||||||
for (const int input_index : node_state.inputs.index_range()) {
|
for (const int input_index : node_state.inputs.index_range()) {
|
||||||
InputState &input_state = node_state.inputs[input_index];
|
InputState &input_state = node_state.inputs[input_index];
|
||||||
if (input_state.was_ready_for_execution) {
|
if (input_state.was_ready_for_execution) {
|
||||||
|
@ -808,7 +796,11 @@ class Executor {
|
||||||
input_state.was_ready_for_execution = true;
|
input_state.was_ready_for_execution = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!fn.allow_missing_requested_inputs()) {
|
const InputSocket &socket = node.input(input_index);
|
||||||
|
if (socket.origin() == nullptr) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!allow_missing_requested_inputs) {
|
||||||
if (input_state.usage == ValueUsage::Used) {
|
if (input_state.usage == ValueUsage::Used) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -848,7 +840,7 @@ class Executor {
|
||||||
/* Importantly, the node must not be locked when it is executed. That would result in locks
|
/* Importantly, the node must not be locked when it is executed. That would result in locks
|
||||||
* being hold very long in some cases and results in multiple locks being hold by the same
|
* being hold very long in some cases and results in multiple locks being hold by the same
|
||||||
* thread in the same graph which can lead to deadlocks. */
|
* thread in the same graph which can lead to deadlocks. */
|
||||||
this->execute_node(node, node_state, current_task);
|
this->execute_node(node, node_state, current_task, allocator);
|
||||||
}
|
}
|
||||||
|
|
||||||
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
||||||
|
@ -857,7 +849,7 @@ class Executor {
|
||||||
this->assert_expected_outputs_have_been_computed(locked_node);
|
this->assert_expected_outputs_have_been_computed(locked_node);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
this->finish_node_if_possible(locked_node);
|
this->finish_node_if_possible(locked_node, allocator);
|
||||||
const bool reschedule_requested = node_state.schedule_state ==
|
const bool reschedule_requested = node_state.schedule_state ==
|
||||||
NodeScheduleState::RunningAndRescheduled;
|
NodeScheduleState::RunningAndRescheduled;
|
||||||
node_state.schedule_state = NodeScheduleState::NotScheduled;
|
node_state.schedule_state = NodeScheduleState::NotScheduled;
|
||||||
|
@ -895,7 +887,7 @@ class Executor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void finish_node_if_possible(LockedNode &locked_node)
|
void finish_node_if_possible(LockedNode &locked_node, LocalAllocator &allocator)
|
||||||
{
|
{
|
||||||
const Node &node = locked_node.node;
|
const Node &node = locked_node.node;
|
||||||
NodeState &node_state = locked_node.node_state;
|
NodeState &node_state = locked_node.node_state;
|
||||||
|
@ -923,44 +915,44 @@ class Executor {
|
||||||
const InputSocket &input_socket = node.input(input_index);
|
const InputSocket &input_socket = node.input(input_index);
|
||||||
InputState &input_state = node_state.inputs[input_index];
|
InputState &input_state = node_state.inputs[input_index];
|
||||||
if (input_state.usage == ValueUsage::Maybe) {
|
if (input_state.usage == ValueUsage::Maybe) {
|
||||||
this->set_input_unused(locked_node, input_socket);
|
this->set_input_unused(locked_node, input_socket, allocator);
|
||||||
}
|
|
||||||
else if (input_state.usage == ValueUsage::Used) {
|
|
||||||
this->destruct_input_value_if_exists(input_state, input_socket.type());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node_state.storage != nullptr) {
|
this->destruct_node_data(node, node_state, allocator);
|
||||||
if (node.is_function()) {
|
|
||||||
const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
|
|
||||||
fn_node.function().destruct_storage(node_state.storage);
|
|
||||||
}
|
|
||||||
node_state.storage = nullptr;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void destruct_input_value_if_exists(InputState &input_state, const CPPType &type)
|
void destruct_input_value_if_exists(InputState &input_state,
|
||||||
|
const CPPType &type,
|
||||||
|
LocalAllocator &allocator)
|
||||||
{
|
{
|
||||||
if (input_state.value != nullptr) {
|
if (input_state.value != nullptr) {
|
||||||
type.destruct(input_state.value);
|
type.destruct(input_state.value);
|
||||||
|
allocator.deallocate(input_state.value, type.size(), type.alignment());
|
||||||
input_state.value = nullptr;
|
input_state.value = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void execute_node(const FunctionNode &node, NodeState &node_state, CurrentTask ¤t_task);
|
void execute_node(const FunctionNode &node,
|
||||||
|
NodeState &node_state,
|
||||||
|
CurrentTask ¤t_task,
|
||||||
|
LocalAllocator &allocator);
|
||||||
|
|
||||||
void set_input_unused_during_execution(const Node &node,
|
void set_input_unused_during_execution(const Node &node,
|
||||||
NodeState &node_state,
|
NodeState &node_state,
|
||||||
const int input_index,
|
const int input_index,
|
||||||
CurrentTask ¤t_task)
|
CurrentTask ¤t_task)
|
||||||
{
|
{
|
||||||
|
LocalAllocator &allocator = this->get_local_allocator();
|
||||||
const InputSocket &input_socket = node.input(input_index);
|
const InputSocket &input_socket = node.input(input_index);
|
||||||
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
|
||||||
this->set_input_unused(locked_node, input_socket);
|
this->set_input_unused(locked_node, input_socket, allocator);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_input_unused(LockedNode &locked_node, const InputSocket &input_socket)
|
void set_input_unused(LockedNode &locked_node,
|
||||||
|
const InputSocket &input_socket,
|
||||||
|
LocalAllocator &allocator)
|
||||||
{
|
{
|
||||||
NodeState &node_state = locked_node.node_state;
|
NodeState &node_state = locked_node.node_state;
|
||||||
const int input_index = input_socket.index();
|
const int input_index = input_socket.index();
|
||||||
|
@ -972,7 +964,7 @@ class Executor {
|
||||||
}
|
}
|
||||||
input_state.usage = ValueUsage::Unused;
|
input_state.usage = ValueUsage::Unused;
|
||||||
|
|
||||||
this->destruct_input_value_if_exists(input_state, input_socket.type());
|
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
|
||||||
if (input_state.was_ready_for_execution) {
|
if (input_state.was_ready_for_execution) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1026,7 +1018,7 @@ class Executor {
|
||||||
CurrentTask ¤t_task)
|
CurrentTask ¤t_task)
|
||||||
{
|
{
|
||||||
BLI_assert(value_to_forward.get() != nullptr);
|
BLI_assert(value_to_forward.get() != nullptr);
|
||||||
LinearAllocator<> &allocator = this->get_main_or_local_allocator();
|
LocalAllocator &allocator = this->get_local_allocator();
|
||||||
const CPPType &type = *value_to_forward.type();
|
const CPPType &type = *value_to_forward.type();
|
||||||
|
|
||||||
if (self_.logger_ != nullptr) {
|
if (self_.logger_ != nullptr) {
|
||||||
|
@ -1038,17 +1030,7 @@ class Executor {
|
||||||
const Node &target_node = target_socket->node();
|
const Node &target_node = target_socket->node();
|
||||||
NodeState &node_state = *node_states_[target_node.index_in_graph()];
|
NodeState &node_state = *node_states_[target_node.index_in_graph()];
|
||||||
const int input_index = target_socket->index();
|
const int input_index = target_socket->index();
|
||||||
InputState &input_state = node_state.inputs[input_index];
|
|
||||||
const bool is_last_target = target_socket == targets.last();
|
const bool is_last_target = target_socket == targets.last();
|
||||||
#ifdef DEBUG
|
|
||||||
if (input_state.value != nullptr) {
|
|
||||||
if (self_.logger_ != nullptr) {
|
|
||||||
self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
|
|
||||||
}
|
|
||||||
BLI_assert_unreachable();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
BLI_assert(!input_state.was_ready_for_execution);
|
|
||||||
BLI_assert(target_socket->type() == type);
|
BLI_assert(target_socket->type() == type);
|
||||||
BLI_assert(target_socket->origin() == &from_socket);
|
BLI_assert(target_socket->origin() == &from_socket);
|
||||||
|
|
||||||
|
@ -1072,6 +1054,18 @@ class Executor {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
this->with_locked_node(target_node, node_state, current_task, [&](LockedNode &locked_node) {
|
this->with_locked_node(target_node, node_state, current_task, [&](LockedNode &locked_node) {
|
||||||
|
InputState &input_state = node_state.inputs[input_index];
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
if (input_state.value != nullptr) {
|
||||||
|
if (self_.logger_ != nullptr) {
|
||||||
|
self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
|
||||||
|
}
|
||||||
|
BLI_assert_unreachable();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
BLI_assert(!input_state.was_ready_for_execution);
|
||||||
|
|
||||||
if (input_state.usage == ValueUsage::Unused) {
|
if (input_state.usage == ValueUsage::Unused) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1089,6 +1083,7 @@ class Executor {
|
||||||
}
|
}
|
||||||
if (value_to_forward.get() != nullptr) {
|
if (value_to_forward.get() != nullptr) {
|
||||||
value_to_forward.destruct();
|
value_to_forward.destruct();
|
||||||
|
allocator.deallocate(value_to_forward.get(), type.size(), type.alignment());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1145,23 +1140,10 @@ class Executor {
|
||||||
if (BLI_system_thread_count() <= 1) {
|
if (BLI_system_thread_count() <= 1) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
this->ensure_thread_locals();
|
|
||||||
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
|
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ensure_thread_locals()
|
|
||||||
{
|
|
||||||
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
|
|
||||||
if (current_main_thread_ != std::this_thread::get_id()) {
|
|
||||||
BLI_assert_unreachable();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (!thread_locals_) {
|
|
||||||
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allow other threads to steal all the nodes that are currently scheduled on this thread.
|
* Allow other threads to steal all the nodes that are currently scheduled on this thread.
|
||||||
*/
|
*/
|
||||||
|
@ -1194,12 +1176,12 @@ class Executor {
|
||||||
[](TaskPool * /*pool*/, void *data) { MEM_delete(static_cast<ScheduledNodes *>(data)); });
|
[](TaskPool * /*pool*/, void *data) { MEM_delete(static_cast<ScheduledNodes *>(data)); });
|
||||||
}
|
}
|
||||||
|
|
||||||
LinearAllocator<> &get_main_or_local_allocator()
|
LocalAllocator &get_local_allocator()
|
||||||
{
|
{
|
||||||
if (this->use_multi_threading()) {
|
if (this->use_multi_threading()) {
|
||||||
return thread_locals_->local().allocator;
|
return context_->allocator->local();
|
||||||
}
|
}
|
||||||
return main_allocator_;
|
return *context_->allocator;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1248,7 +1230,7 @@ class GraphExecutorLFParams final : public Params {
|
||||||
OutputState &output_state = node_state_.outputs[index];
|
OutputState &output_state = node_state_.outputs[index];
|
||||||
BLI_assert(!output_state.has_been_computed);
|
BLI_assert(!output_state.has_been_computed);
|
||||||
if (output_state.value == nullptr) {
|
if (output_state.value == nullptr) {
|
||||||
LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
|
LocalAllocator &allocator = executor_.get_local_allocator();
|
||||||
const CPPType &type = node_.output(index).type();
|
const CPPType &type = node_.output(index).type();
|
||||||
output_state.value = allocator.allocate(type.size(), type.alignment());
|
output_state.value = allocator.allocate(type.size(), type.alignment());
|
||||||
}
|
}
|
||||||
|
@ -1297,13 +1279,15 @@ class GraphExecutorLFParams final : public Params {
|
||||||
*/
|
*/
|
||||||
inline void Executor::execute_node(const FunctionNode &node,
|
inline void Executor::execute_node(const FunctionNode &node,
|
||||||
NodeState &node_state,
|
NodeState &node_state,
|
||||||
CurrentTask ¤t_task)
|
CurrentTask ¤t_task,
|
||||||
|
LocalAllocator &allocator)
|
||||||
{
|
{
|
||||||
const LazyFunction &fn = node.function();
|
const LazyFunction &fn = node.function();
|
||||||
GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task};
|
GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task};
|
||||||
BLI_assert(context_ != nullptr);
|
BLI_assert(context_ != nullptr);
|
||||||
Context fn_context = *context_;
|
Context fn_context = *context_;
|
||||||
fn_context.storage = node_state.storage;
|
fn_context.storage = node_state.storage;
|
||||||
|
fn_context.allocator = &allocator;
|
||||||
|
|
||||||
if (self_.logger_ != nullptr) {
|
if (self_.logger_ != nullptr) {
|
||||||
self_.logger_->log_before_node_execute(node, node_params, fn_context);
|
self_.logger_->log_before_node_execute(node, node_params, fn_context);
|
||||||
|
@ -1330,12 +1314,32 @@ inline void Executor::execute_node(const FunctionNode &node,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GraphExecutor::preprocess(const Graph &graph, PreprocessData &r_preprocess_data)
|
||||||
|
{
|
||||||
|
const Span<const Node *> nodes = graph.nodes();
|
||||||
|
r_preprocess_data.offsets.reinitialize(nodes.size());
|
||||||
|
int offset = 0;
|
||||||
|
for (const int i : nodes.index_range()) {
|
||||||
|
const Node &node = *nodes[i];
|
||||||
|
NodeBufferOffsets &node_offsets = r_preprocess_data.offsets[i];
|
||||||
|
node_offsets.node = offset;
|
||||||
|
offset += sizeof(NodeState);
|
||||||
|
node_offsets.inputs = offset;
|
||||||
|
offset += sizeof(InputState) * node.inputs().size();
|
||||||
|
node_offsets.outputs = offset;
|
||||||
|
offset += sizeof(OutputState) * node.outputs().size();
|
||||||
|
}
|
||||||
|
r_preprocess_data.node_state_buffer_size = offset;
|
||||||
|
}
|
||||||
|
|
||||||
GraphExecutor::GraphExecutor(const Graph &graph,
|
GraphExecutor::GraphExecutor(const Graph &graph,
|
||||||
const Span<const OutputSocket *> graph_inputs,
|
const Span<const OutputSocket *> graph_inputs,
|
||||||
const Span<const InputSocket *> graph_outputs,
|
const Span<const InputSocket *> graph_outputs,
|
||||||
|
const PreprocessData &preprocess_data,
|
||||||
const Logger *logger,
|
const Logger *logger,
|
||||||
const SideEffectProvider *side_effect_provider)
|
const SideEffectProvider *side_effect_provider)
|
||||||
: graph_(graph),
|
: graph_(graph),
|
||||||
|
preprocess_data_(preprocess_data),
|
||||||
graph_inputs_(graph_inputs),
|
graph_inputs_(graph_inputs),
|
||||||
graph_outputs_(graph_outputs),
|
graph_outputs_(graph_outputs),
|
||||||
logger_(logger),
|
logger_(logger),
|
||||||
|
@ -1360,15 +1364,17 @@ void GraphExecutor::execute_impl(Params ¶ms, const Context &context) const
|
||||||
executor.execute(params, context);
|
executor.execute(params, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
|
void *GraphExecutor::init_storage(LocalAllocator &allocator) const
|
||||||
{
|
{
|
||||||
Executor &executor = *allocator.construct<Executor>(*this).release();
|
Executor &executor = allocator.allocate_new<Executor>(*this);
|
||||||
return &executor;
|
return &executor;
|
||||||
}
|
}
|
||||||
|
|
||||||
void GraphExecutor::destruct_storage(void *storage) const
|
void GraphExecutor::destruct_storage(void *storage, LocalAllocator &allocator) const
|
||||||
{
|
{
|
||||||
std::destroy_at(static_cast<Executor *>(storage));
|
Executor *executor = static_cast<Executor *>(storage);
|
||||||
|
executor->destruct_state(allocator);
|
||||||
|
allocator.destruct_free(executor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void GraphExecutorLogger::log_socket_value(const Socket &socket,
|
void GraphExecutorLogger::log_socket_value(const Socket &socket,
|
||||||
|
|
|
@ -105,7 +105,11 @@ TEST(lazy_function, SideEffects)
|
||||||
|
|
||||||
SimpleSideEffectProvider side_effect_provider{{&store_node}};
|
SimpleSideEffectProvider side_effect_provider{{&store_node}};
|
||||||
|
|
||||||
GraphExecutor executor_fn{graph, {&input_node.output(0)}, {}, nullptr, &side_effect_provider};
|
GraphExecutor::PreprocessData preprocess_data;
|
||||||
|
GraphExecutor::preprocess(graph, preprocess_data);
|
||||||
|
|
||||||
|
GraphExecutor executor_fn{
|
||||||
|
graph, {&input_node.output(0)}, {}, preprocess_data, nullptr, &side_effect_provider};
|
||||||
execute_lazy_function_eagerly(executor_fn, nullptr, std::make_tuple(5), std::make_tuple());
|
execute_lazy_function_eagerly(executor_fn, nullptr, std::make_tuple(5), std::make_tuple());
|
||||||
|
|
||||||
EXPECT_EQ(dst1, 15);
|
EXPECT_EQ(dst1, 15);
|
||||||
|
@ -167,8 +171,11 @@ TEST(lazy_function, GraphWithCycle)
|
||||||
|
|
||||||
graph.update_node_indices();
|
graph.update_node_indices();
|
||||||
|
|
||||||
|
GraphExecutor::PreprocessData preprocess_data;
|
||||||
|
GraphExecutor::preprocess(graph, preprocess_data);
|
||||||
|
|
||||||
GraphExecutor executor_fn{
|
GraphExecutor executor_fn{
|
||||||
graph, {&input_node.output(0)}, {&output_node.input(0)}, nullptr, nullptr};
|
graph, {&input_node.output(0)}, {&output_node.input(0)}, preprocess_data, nullptr, nullptr};
|
||||||
int result = 0;
|
int result = 0;
|
||||||
execute_lazy_function_eagerly(
|
execute_lazy_function_eagerly(
|
||||||
executor_fn, nullptr, std::make_tuple(10), std::make_tuple(&result));
|
executor_fn, nullptr, std::make_tuple(10), std::make_tuple(&result));
|
||||||
|
|
|
@ -1146,8 +1146,12 @@ static GeometrySet compute_geometry(
|
||||||
blender::nodes::GeometryNodesLazyFunctionLogger lf_logger(lf_graph_info);
|
blender::nodes::GeometryNodesLazyFunctionLogger lf_logger(lf_graph_info);
|
||||||
blender::nodes::GeometryNodesLazyFunctionSideEffectProvider lf_side_effect_provider;
|
blender::nodes::GeometryNodesLazyFunctionSideEffectProvider lf_side_effect_provider;
|
||||||
|
|
||||||
lf::GraphExecutor graph_executor{
|
lf::GraphExecutor graph_executor{lf_graph_info.graph,
|
||||||
lf_graph_info.graph, graph_inputs, graph_outputs, &lf_logger, &lf_side_effect_provider};
|
graph_inputs,
|
||||||
|
graph_outputs,
|
||||||
|
lf_graph_info.graph_preprocess_data,
|
||||||
|
&lf_logger,
|
||||||
|
&lf_side_effect_provider};
|
||||||
|
|
||||||
blender::nodes::GeoNodesModifierData geo_nodes_modifier_data;
|
blender::nodes::GeoNodesModifierData geo_nodes_modifier_data;
|
||||||
geo_nodes_modifier_data.depsgraph = ctx->depsgraph;
|
geo_nodes_modifier_data.depsgraph = ctx->depsgraph;
|
||||||
|
@ -1169,7 +1173,9 @@ static GeometrySet compute_geometry(
|
||||||
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
|
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
|
||||||
user_data.compute_context = &modifier_compute_context;
|
user_data.compute_context = &modifier_compute_context;
|
||||||
|
|
||||||
blender::LinearAllocator<> allocator;
|
blender::LocalAllocatorSet allocator_set;
|
||||||
|
blender::LocalAllocator &allocator = allocator_set.local();
|
||||||
|
|
||||||
Vector<GMutablePointer> inputs_to_destruct;
|
Vector<GMutablePointer> inputs_to_destruct;
|
||||||
|
|
||||||
int input_index = -1;
|
int input_index = -1;
|
||||||
|
@ -1212,6 +1218,7 @@ static GeometrySet compute_geometry(
|
||||||
lf::Context lf_context;
|
lf::Context lf_context;
|
||||||
lf_context.storage = graph_executor.init_storage(allocator);
|
lf_context.storage = graph_executor.init_storage(allocator);
|
||||||
lf_context.user_data = &user_data;
|
lf_context.user_data = &user_data;
|
||||||
|
lf_context.allocator = &allocator;
|
||||||
lf::BasicParams lf_params{graph_executor,
|
lf::BasicParams lf_params{graph_executor,
|
||||||
param_inputs,
|
param_inputs,
|
||||||
param_outputs,
|
param_outputs,
|
||||||
|
@ -1219,7 +1226,7 @@ static GeometrySet compute_geometry(
|
||||||
param_output_usages,
|
param_output_usages,
|
||||||
param_set_outputs};
|
param_set_outputs};
|
||||||
graph_executor.execute(lf_params, lf_context);
|
graph_executor.execute(lf_params, lf_context);
|
||||||
graph_executor.destruct_storage(lf_context.storage);
|
graph_executor.destruct_storage(lf_context.storage, allocator);
|
||||||
|
|
||||||
for (GMutablePointer &ptr : inputs_to_destruct) {
|
for (GMutablePointer &ptr : inputs_to_destruct) {
|
||||||
ptr.destruct();
|
ptr.destruct();
|
||||||
|
@ -1289,6 +1296,7 @@ static void modifyGeometry(ModifierData *md,
|
||||||
const ModifierEvalContext *ctx,
|
const ModifierEvalContext *ctx,
|
||||||
GeometrySet &geometry_set)
|
GeometrySet &geometry_set)
|
||||||
{
|
{
|
||||||
|
SCOPED_TIMER_AVERAGED(__func__);
|
||||||
NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
|
NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
|
||||||
if (nmd->node_group == nullptr) {
|
if (nmd->node_group == nullptr) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -187,6 +187,7 @@ struct GeometryNodesLazyFunctionGraphInfo {
|
||||||
* Mappings between the lazy-function graph and the #bNodeTree.
|
* Mappings between the lazy-function graph and the #bNodeTree.
|
||||||
*/
|
*/
|
||||||
GeometryNodeLazyFunctionGraphMapping mapping;
|
GeometryNodeLazyFunctionGraphMapping mapping;
|
||||||
|
lf::GraphExecutor::PreprocessData graph_preprocess_data;
|
||||||
/**
|
/**
|
||||||
* Approximate number of nodes in the graph if all sub-graphs were inlined.
|
* Approximate number of nodes in the graph if all sub-graphs were inlined.
|
||||||
* This can be used as a simple heuristic for the complexity of the node group.
|
* This can be used as a simple heuristic for the complexity of the node group.
|
||||||
|
|
|
@ -769,6 +769,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
|
||||||
graph_executor_.emplace(lf_graph_info.graph,
|
graph_executor_.emplace(lf_graph_info.graph,
|
||||||
std::move(graph_inputs),
|
std::move(graph_inputs),
|
||||||
std::move(graph_outputs),
|
std::move(graph_outputs),
|
||||||
|
lf_graph_info.graph_preprocess_data,
|
||||||
&*lf_logger_,
|
&*lf_logger_,
|
||||||
&*lf_side_effect_provider_);
|
&*lf_side_effect_provider_);
|
||||||
}
|
}
|
||||||
|
@ -805,18 +806,18 @@ class LazyFunctionForGroupNode : public LazyFunction {
|
||||||
graph_executor_->execute(params, group_context);
|
graph_executor_->execute(params, group_context);
|
||||||
}
|
}
|
||||||
|
|
||||||
void *init_storage(LinearAllocator<> &allocator) const override
|
void *init_storage(LocalAllocator &allocator) const override
|
||||||
{
|
{
|
||||||
Storage *s = allocator.construct<Storage>().release();
|
Storage &s = allocator.allocate_new<Storage>();
|
||||||
s->graph_executor_storage = graph_executor_->init_storage(allocator);
|
s.graph_executor_storage = graph_executor_->init_storage(allocator);
|
||||||
return s;
|
return &s;
|
||||||
}
|
}
|
||||||
|
|
||||||
void destruct_storage(void *storage) const override
|
void destruct_storage(void *storage, LocalAllocator &allocator) const override
|
||||||
{
|
{
|
||||||
Storage *s = static_cast<Storage *>(storage);
|
Storage *s = static_cast<Storage *>(storage);
|
||||||
graph_executor_->destruct_storage(s->graph_executor_storage);
|
graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
|
||||||
std::destroy_at(s);
|
allocator.destruct_free(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string name() const override
|
std::string name() const override
|
||||||
|
@ -1243,6 +1244,7 @@ struct GeometryNodesLazyFunctionGraphBuilder {
|
||||||
|
|
||||||
lf_graph_->update_node_indices();
|
lf_graph_->update_node_indices();
|
||||||
lf_graph_info_->num_inline_nodes_approximate += lf_graph_->nodes().size();
|
lf_graph_info_->num_inline_nodes_approximate += lf_graph_->nodes().size();
|
||||||
|
lf::GraphExecutor::preprocess(*lf_graph_, lf_graph_info_->graph_preprocess_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
Loading…
Reference in New Issue