diff --git a/source/blender/blenlib/BLI_local_allocator.hh b/source/blender/blenlib/BLI_local_allocator.hh
new file mode 100644
index 00000000000..efc2c7f616d
--- /dev/null
+++ b/source/blender/blenlib/BLI_local_allocator.hh
@@ -0,0 +1,331 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#pragma once
+
+#include <cstddef>
+
+#include "BLI_allocator.hh"
+#include "BLI_asan.h"
+#include "BLI_enumerable_thread_specific.hh"
+#include "BLI_linear_allocator.hh"
+#include "BLI_map.hh"
+#include "BLI_math_bits.h"
+#include "BLI_stack.hh"
+#include "BLI_utility_mixins.hh"
+#include "BLI_vector.hh"
+
+// #define BLI_LOCAL_ALLOCATOR_USE_GUARDED
+// #define BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
+
+namespace blender {
+
+class LocalAllocatorSet;
+class LocalAllocator;
+class LocalAllocatorPool;
+
+class LocalAllocatorPool : NonCopyable, NonMovable {
+ private:
+  Stack<void *> buffers;
+  int64_t element_size = -1;
+  int64_t alignment = -1;
+
+  friend LocalAllocator;
+};
+
+class LocalAllocator : NonCopyable, NonMovable {
+ private:
+  static constexpr int64_t s_alignment = 64;
+  static constexpr int64_t s_global_allocation_threshold = 5 * 1024 * 1024;
+  LocalAllocatorSet &owner_set_;
+  AlignedBuffer<256, 64> initial_buffer_;
+  LinearAllocator<> linear_allocator_;
+
+  struct Head {
+    int64_t buffer_size;
+    int64_t buffer_alignment;
+  };
+  static_assert(is_power_of_2_constexpr(sizeof(Head)));
+
+  std::array<LocalAllocatorPool, 8> small_buffer_pools_;
+  Map<int, std::unique_ptr<LocalAllocatorPool>> large_buffer_pools_;
+
+  friend LocalAllocatorSet;
+
+  LocalAllocator(LocalAllocatorSet &owner_set);
+
+ public:
+  ~LocalAllocator();
+
+  bool is_local() const;
+  LocalAllocator &local();
+  LocalAllocatorSet &owner_set();
+
+  void *allocate(int64_t size, int64_t alignment);
+  void deallocate(const void *buffer, int64_t size, int64_t alignment);
+
+  void *allocate(LocalAllocatorPool &pool);
+  void deallocate(const void *buffer, LocalAllocatorPool &pool);
+
+  void *allocate_with_head(int64_t size, int64_t alignment);
+  void deallocate_with_head(const void *buffer);
+
+  LocalAllocatorPool &get_pool(int64_t size, int64_t alignment);
+
+  template<typename T, typename... Args> T &allocate_new(Args &&...args);
+  template<typename T, typename... Args> void destruct_free(const T *value);
+  template<typename T> MutableSpan<T> allocate_array(int64_t size);
+  template<typename T, typename... Args>
+  MutableSpan<T> allocate_new_array(int64_t size, Args &&...args);
+  template<typename T> void destruct_free_array(Span<T> data);
+  template<typename T> void destruct_free_array(MutableSpan<T> data);
+};
+
+class LocalAllocatorSet : NonCopyable, NonMovable {
+ private:
+  threading::EnumerableThreadSpecific<LocalAllocator> allocator_by_thread_;
+
+#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
+  std::mutex debug_sizes_mutex_;
+  Map<const void *, int64_t> debug_sizes_;
+#endif
+
+  friend LocalAllocator;
+
+ public:
+  LocalAllocatorSet();
+  ~LocalAllocatorSet();
+
+  LocalAllocator &local();
+};
+
+class ThreadedLocalAllocatorRef {
+ private:
+  LocalAllocatorSet &allocator_set_;
+
+ public:
+  ThreadedLocalAllocatorRef(LocalAllocator &allocator) : allocator_set_(allocator.owner_set())
+  {
+  }
+
+  void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
+  {
+    LocalAllocator &allocator = allocator_set_.local();
+    return allocator.allocate_with_head(size, alignment);
+  }
+
+  void deallocate(void *ptr)
+  {
+    LocalAllocator &allocator = allocator_set_.local();
+    allocator.deallocate_with_head(ptr);
+  }
+};
+
+class LocalAllocatorRef {
+ private:
+  LocalAllocator &allocator_;
+
+ public:
+  LocalAllocatorRef(LocalAllocator &allocator) : allocator_(allocator)
+  {
+  }
+
+  void *allocate(const size_t size, const size_t alignment, const char * /*name*/)
+  {
+    return allocator_.allocate_with_head(size, alignment);
+  }
+
+  void deallocate(void *ptr)
+  {
+    allocator_.deallocate_with_head(ptr);
+  }
+};
+
+inline bool LocalAllocator::is_local() const
+{
+  return this == &owner_set_.local();
+}
+
+inline LocalAllocator &LocalAllocator::local()
+{
+  return owner_set_.local();
+}
+
+inline LocalAllocatorSet &LocalAllocator::owner_set()
+{
+  return owner_set_;
+}
+
+inline void *LocalAllocator::allocate(const int64_t size, const int64_t alignment)
+{
+  LocalAllocatorPool &pool = this->get_pool(size, alignment);
+  BLI_assert(pool.element_size >= size);
+  BLI_assert(pool.alignment >= alignment);
+
+  return this->allocate(pool);
+}
+
+inline void LocalAllocator::deallocate(const void *buffer,
+                                       const int64_t size,
+                                       const int64_t alignment)
+{
+  LocalAllocatorPool &pool = this->get_pool(size, alignment);
+  BLI_assert(pool.element_size >= size);
+  BLI_assert(pool.alignment >= alignment);
+
+  this->deallocate(buffer, pool);
+}
+
+inline void *LocalAllocator::allocate(LocalAllocatorPool &pool)
+{
+  BLI_assert(this->is_local());
+
+#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
+  return MEM_mallocN_aligned(pool.element_size, pool.alignment, __func__);
+#endif
+
+  void *buffer;
+  if (!pool.buffers.is_empty()) {
+    buffer = pool.buffers.pop();
+    BLI_asan_unpoison(buffer, pool.element_size);
+  }
+  else if (pool.element_size < s_global_allocation_threshold) {
+    buffer = linear_allocator_.allocate(pool.element_size, pool.alignment);
+  }
+  else {
+    buffer = MEM_mallocN(pool.element_size, __func__);
+  }
+
+#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
+  {
+    std::lock_guard lock{owner_set_.debug_sizes_mutex_};
+    owner_set_.debug_sizes_.add_new(buffer, pool.element_size);
+  }
+#endif
+
+  return buffer;
+}
+
+inline void LocalAllocator::deallocate(const void *buffer, LocalAllocatorPool &pool)
+{
+  BLI_assert(this->is_local());
+
+#ifdef BLI_LOCAL_ALLOCATOR_USE_GUARDED
+  MEM_freeN(const_cast<void *>(buffer));
+  return;
+#endif
+
+#ifdef BLI_LOCAL_ALLOCATOR_DEBUG_SIZES
+  {
+    std::lock_guard lock{owner_set_.debug_sizes_mutex_};
+    auto [last_size, last_alignment] = owner_set_.debug_sizes_.pop(buffer);
+    if (last_size != size) {
+      BLI_assert_unreachable();
+    }
+    if (last_alignment != alignment) {
+      BLI_assert_unreachable();
+    }
+  }
+#endif
+
+#ifdef DEBUG
+  memset(const_cast<void *>(buffer), -1, pool.element_size);
+#endif
+
+  if (pool.element_size < s_global_allocation_threshold) {
+    BLI_asan_poison(buffer, pool.element_size);
+    pool.buffers.push(const_cast<void *>(buffer));
+  }
+  else {
+    MEM_freeN(const_cast<void *>(buffer));
+  }
+}
+
+inline LocalAllocatorPool &LocalAllocator::get_pool(const int64_t size, const int64_t alignment)
+{
+  BLI_assert(size > 0);
+  BLI_assert(alignment <= size);
+  BLI_assert(alignment <= s_alignment);
+  BLI_assert(is_power_of_2_i(alignment));
+  UNUSED_VARS_NDEBUG(alignment);
+
+  BLI_assert(this->is_local());
+  if (size <= 64) {
+    return small_buffer_pools_[(size - 1) >> 3];
+  }
+  const int key = bitscan_reverse_uint64(uint64_t(size));
+  return *large_buffer_pools_.lookup_or_add_cb(key, [&]() {
+    auto pool = std::make_unique<LocalAllocatorPool>();
+    pool->element_size = int64_t(1) << (8 * sizeof(int64_t) - key);
+    pool->alignment = s_alignment;
+    return pool;
+  });
+}
+
+inline void *LocalAllocator::allocate_with_head(int64_t size, int64_t alignment)
+{
+  const int64_t buffer_size = size + std::max<int64_t>(alignment, sizeof(Head));
+  const int64_t buffer_alignment = std::max<int64_t>(alignment, alignof(Head));
+  void *buffer = this->allocate(buffer_size, buffer_alignment);
+  Head *head = new (buffer) Head;
+  head->buffer_size = buffer_size;
+  head->buffer_alignment = buffer_alignment;
+  return head + 1;
+}
+
+inline void LocalAllocator::deallocate_with_head(const void *buffer)
+{
+  const Head *head = static_cast<const Head *>(buffer) - 1;
+  this->deallocate(head, head->buffer_size, head->buffer_alignment);
+}
+
+template<typename T, typename... Args> inline T &LocalAllocator::allocate_new(Args &&...args)
+{
+  void *buffer = this->allocate(sizeof(T), alignof(T));
+  T *value = new (buffer) T(std::forward<Args>(args)...);
+  return *value;
+}
+
+template<typename T, typename... Args> inline void LocalAllocator::destruct_free(const T *value)
+{
+  std::destroy_at(value);
+  this->deallocate(value, sizeof(T), alignof(T));
+}
+
+template<typename T> MutableSpan<T> inline LocalAllocator::allocate_array(const int64_t size)
+{
+  if (size == 0) {
+    return {};
+  }
+  void *buffer = this->allocate(size * sizeof(T), alignof(T));
+  return {static_cast<T *>(buffer), size};
+}
+
+template<typename T, typename... Args>
+MutableSpan<T> inline LocalAllocator::allocate_new_array(const int64_t size, Args &&...args)
+{
+  MutableSpan<T> array = this->allocate_array<T>(size);
+  for (const int64_t i : IndexRange(size)) {
+    new (&array[i]) T(std::forward<Args>(args)...);
+  }
+  return array;
+}
+
+template<typename T> inline void LocalAllocator::destruct_free_array(Span<T> data)
+{
+  if (data.is_empty()) {
+    return;
+  }
+  destruct_n(const_cast<T *>(data.data()), data.size());
+  this->deallocate(data.data(), data.size_in_bytes(), alignof(T));
+}
+
+template<typename T> inline void LocalAllocator::destruct_free_array(MutableSpan<T> data)
+{
+  this->destruct_free_array(data.as_span());
+}
+
+inline LocalAllocator &LocalAllocatorSet::local()
+{
+  return allocator_by_thread_.local();
+}
+
+}  // namespace blender
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 7c24fb8b5e7..d9a287679f5 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -91,6 +91,7 @@ set(SRC
   intern/lazy_threading.cc
   intern/length_parameterize.cc
   intern/listbase.cc
+  intern/local_allocator.cc
   intern/math_base.c
   intern/math_base_inline.c
   intern/math_base_safe_inline.c
@@ -256,6 +257,7 @@ set(SRC
   BLI_linklist_stack.h
   BLI_listbase.h
   BLI_listbase_wrapper.hh
+  BLI_local_allocator.hh
   BLI_map.hh
   BLI_map_slots.hh
   BLI_math.h
@@ -484,6 +486,7 @@ if(WITH_GTESTS)
     tests/BLI_linear_allocator_test.cc
     tests/BLI_linklist_lockfree_test.cc
     tests/BLI_listbase_test.cc
+    tests/BLI_local_allocator_test.cc
     tests/BLI_map_test.cc
     tests/BLI_math_base_safe_test.cc
     tests/BLI_math_base_test.cc
diff --git a/source/blender/blenlib/intern/local_allocator.cc b/source/blender/blenlib/intern/local_allocator.cc
new file mode 100644
index 00000000000..73015d3cff5
--- /dev/null
+++ b/source/blender/blenlib/intern/local_allocator.cc
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "BLI_local_allocator.hh"
+
+namespace blender {
+
+LocalAllocatorSet::LocalAllocatorSet()
+    : allocator_by_thread_([this]() { return LocalAllocator{*this}; })
+{
+}
+
+LocalAllocatorSet::~LocalAllocatorSet() = default;
+
+LocalAllocator::LocalAllocator(LocalAllocatorSet &owner_set) : owner_set_(owner_set)
+{
+  linear_allocator_.provide_buffer(initial_buffer_);
+  for (const int64_t i : IndexRange(small_buffer_pools_.size())) {
+    LocalAllocatorPool &pool = small_buffer_pools_[i];
+    pool.element_size = 8 * (i + 1);
+    pool.alignment = power_of_2_min_u(pool.element_size);
+  }
+}
+
+LocalAllocator::~LocalAllocator() = default;
+
+}  // namespace blender
diff --git a/source/blender/blenlib/tests/BLI_local_allocator_test.cc b/source/blender/blenlib/tests/BLI_local_allocator_test.cc
new file mode 100644
index 00000000000..1e7feca5b39
--- /dev/null
+++ b/source/blender/blenlib/tests/BLI_local_allocator_test.cc
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+#include "BLI_local_allocator.hh"
+#include "BLI_strict_flags.h"
+
+#include "testing/testing.h"
+
+namespace blender::tests {
+
+}  // namespace blender::tests
diff --git a/source/blender/functions/FN_lazy_function.hh b/source/blender/functions/FN_lazy_function.hh
index 5379e5f8506..a3f6987573a 100644
--- a/source/blender/functions/FN_lazy_function.hh
+++ b/source/blender/functions/FN_lazy_function.hh
@@ -42,6 +42,7 @@
 #include "BLI_function_ref.hh"
 #include "BLI_generic_pointer.hh"
 #include "BLI_linear_allocator.hh"
+#include "BLI_local_allocator.hh"
 #include "BLI_vector.hh"
 
 #include <atomic>
@@ -98,6 +99,8 @@ struct Context {
    * Custom user data that can be used in the function.
    */
   UserData *user_data;
+
+  LocalAllocator *allocator;
 };
 
 /**
@@ -276,12 +279,12 @@ class LazyFunction {
    * Allocates storage for this function. The storage will be passed to every call to #execute.
    * If the function does not keep track of any state, this does not have to be implemented.
    */
-  virtual void *init_storage(LinearAllocator<> &allocator) const;
+  virtual void *init_storage(LocalAllocator &allocator) const;
 
   /**
    * Destruct the storage created in #init_storage.
    */
-  virtual void destruct_storage(void *storage) const;
+  virtual void destruct_storage(void *storage, LocalAllocator &allocator) const;
 
   /**
    * Calls `fn` with the input indices that the given `output_index` may depend on. By default
diff --git a/source/blender/functions/FN_lazy_function_execute.hh b/source/blender/functions/FN_lazy_function_execute.hh
index 5785fddaa51..c604ba19deb 100644
--- a/source/blender/functions/FN_lazy_function_execute.hh
+++ b/source/blender/functions/FN_lazy_function_execute.hh
@@ -85,14 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
       ...);
   output_usages.fill(ValueUsage::Used);
   set_outputs.fill(false);
-  LinearAllocator<> allocator;
+  LocalAllocatorSet allocator_set;
+  LocalAllocator &allocator = allocator_set.local();
   Context context;
   context.user_data = user_data;
   context.storage = fn.init_storage(allocator);
+  context.allocator = &allocator;
   BasicParams params{
       fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
   fn.execute(params, context);
-  fn.destruct_storage(context.storage);
+  fn.destruct_storage(context.storage, allocator);
 
   /* Make sure all outputs have been computed.  */
   BLI_assert(!Span<bool>(set_outputs).contains(false));
diff --git a/source/blender/functions/FN_lazy_function_graph_executor.hh b/source/blender/functions/FN_lazy_function_graph_executor.hh
index a6ae5cac967..a13ca907c21 100644
--- a/source/blender/functions/FN_lazy_function_graph_executor.hh
+++ b/source/blender/functions/FN_lazy_function_graph_executor.hh
@@ -59,11 +59,23 @@ class GraphExecutor : public LazyFunction {
   using Logger = GraphExecutorLogger;
   using SideEffectProvider = GraphExecutorSideEffectProvider;
 
+  struct NodeBufferOffsets {
+    int node;
+    int inputs;
+    int outputs;
+  };
+
+  struct PreprocessData {
+    Array<NodeBufferOffsets> offsets;
+    int node_state_buffer_size;
+  };
+
  private:
   /**
    * The graph that is evaluated.
    */
   const Graph &graph_;
+  const PreprocessData &preprocess_data_;
   /**
    * Input and output sockets of the entire graph.
    */
@@ -85,11 +97,14 @@ class GraphExecutor : public LazyFunction {
   GraphExecutor(const Graph &graph,
                 Span<const OutputSocket *> graph_inputs,
                 Span<const InputSocket *> graph_outputs,
+                const PreprocessData &preprocess_data,
                 const Logger *logger,
                 const SideEffectProvider *side_effect_provider);
 
-  void *init_storage(LinearAllocator<> &allocator) const override;
-  void destruct_storage(void *storage) const override;
+  void *init_storage(LocalAllocator &allocator) const override;
+  void destruct_storage(void *storage, LocalAllocator &allocator) const override;
+
+  static void preprocess(const Graph &graph, PreprocessData &r_preprocess_data);
 
  private:
   void execute_impl(Params &params, const Context &context) const override;
diff --git a/source/blender/functions/intern/lazy_function.cc b/source/blender/functions/intern/lazy_function.cc
index d42b1889160..e5d7481f4d9 100644
--- a/source/blender/functions/intern/lazy_function.cc
+++ b/source/blender/functions/intern/lazy_function.cc
@@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
   return outputs_[index].debug_name;
 }
 
-void *LazyFunction::init_storage(LinearAllocator<> & /*allocator*/) const
+void *LazyFunction::init_storage(LocalAllocator & /*allocator*/) const
 {
   return nullptr;
 }
 
-void LazyFunction::destruct_storage(void *storage) const
+void LazyFunction::destruct_storage(void *storage, LocalAllocator & /*allocator*/) const
 {
   BLI_assert(storage == nullptr);
   UNUSED_VARS_NDEBUG(storage);
diff --git a/source/blender/functions/intern/lazy_function_graph_executor.cc b/source/blender/functions/intern/lazy_function_graph_executor.cc
index 4270885fc98..55efecab4a9 100644
--- a/source/blender/functions/intern/lazy_function_graph_executor.cc
+++ b/source/blender/functions/intern/lazy_function_graph_executor.cc
@@ -75,7 +75,7 @@ enum class NodeScheduleState {
   RunningAndRescheduled,
 };
 
-struct InputState {
+struct alignas(8) InputState {
   /**
    * Value of this input socket. By default, the value is empty. When other nodes are done
    * computing their outputs, the computed values will be forwarded to linked input sockets. The
@@ -97,7 +97,7 @@ struct InputState {
   bool was_ready_for_execution = false;
 };
 
-struct OutputState {
+struct alignas(8) OutputState {
   /**
    * Keeps track of how the output value is used. If a connected input becomes used, this output
    * has to become used as well. The output becomes unused when it is used by no input socket
@@ -127,7 +127,7 @@ struct OutputState {
   void *value = nullptr;
 };
 
-struct NodeState {
+struct alignas(8) NodeState {
   /**
    * Needs to be locked when any data in this state is accessed that is not explicitly marked as
    * not needing the lock.
@@ -271,7 +271,7 @@ class Executor {
   /**
    * State of every node, indexed by #Node::index_in_graph.
    */
-  Array<NodeState *> node_states_;
+  MutableSpan<NodeState *> node_states_;
   /**
    * Parameters provided by the caller. This is always non-null, while a node is running.
    */
@@ -285,15 +285,7 @@ class Executor {
 #ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
   std::thread::id current_main_thread_;
 #endif
-  /**
-   * A separate linear allocator for every thread. We could potentially reuse some memory, but that
-   * doesn't seem worth it yet.
-   */
-  struct ThreadLocalData {
-    LinearAllocator<> allocator;
-  };
-  std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
-  LinearAllocator<> main_allocator_;
+
   /**
    * Set to false when the first execution ends.
    */
@@ -308,18 +300,25 @@ class Executor {
     BLI_assert(self_.graph_.node_indices_are_valid());
   }
 
-  ~Executor()
+  void destruct_state(LocalAllocator &allocator)
   {
     if (TaskPool *task_pool = task_pool_.load()) {
       BLI_task_pool_free(task_pool);
     }
     threading::parallel_for(node_states_.index_range(), 1024, [&](const IndexRange range) {
+      LocalAllocator &local_allocator = allocator.local();
       for (const int node_index : range) {
         const Node &node = *self_.graph_.nodes()[node_index];
         NodeState &node_state = *node_states_[node_index];
-        this->destruct_node_state(node, node_state);
+        if (!node_state.node_has_finished) {
+          this->destruct_node_data(node, node_state, local_allocator);
+        }
+        std::destroy_at(&node_state);
       }
     });
+    allocator.deallocate(
+        node_states_[0], self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
+    allocator.destruct_free_array(node_states_);
   }
 
   /**
@@ -364,7 +363,7 @@ class Executor {
         }
       }
 
-      this->initialize_static_value_usages(side_effect_nodes);
+      this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator());
       this->schedule_side_effect_nodes(side_effect_nodes, current_task);
     }
 
@@ -382,54 +381,41 @@ class Executor {
   void initialize_node_states()
   {
     Span<const Node *> nodes = self_.graph_.nodes();
-    node_states_.reinitialize(nodes.size());
+    node_states_ = context_->allocator->allocate_array<NodeState *>(nodes.size());
 
-    auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
-      for (const int i : range) {
-        const Node &node = *nodes[i];
-        NodeState &node_state = *allocator.construct<NodeState>().release();
-        node_states_[i] = &node_state;
-        this->construct_initial_node_state(allocator, node, node_state);
-      }
-    };
-    if (nodes.size() <= 256) {
-      construct_node_range(nodes.index_range(), main_allocator_);
-    }
-    else {
-      this->ensure_thread_locals();
-      /* Construct all node states in parallel. */
-      threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
-        LinearAllocator<> &allocator = thread_locals_->local().allocator;
-        construct_node_range(range, allocator);
-      });
+    void *node_states_buffer = context_->allocator->allocate(
+        self_.preprocess_data_.node_state_buffer_size, alignof(NodeState));
+
+    for (const int i : nodes.index_range()) {
+      const Node &node = *nodes[i];
+      const GraphExecutor::NodeBufferOffsets &node_offsets = self_.preprocess_data_.offsets[i];
+      void *state_buffer = POINTER_OFFSET(node_states_buffer, node_offsets.node);
+      NodeState *node_state = new (state_buffer) NodeState();
+      node_state->inputs = {
+          static_cast<InputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.inputs)),
+          node.inputs().size()};
+      node_state->outputs = {
+          static_cast<OutputState *>(POINTER_OFFSET(node_states_buffer, node_offsets.outputs)),
+          node.outputs().size()};
+      default_construct_n(node_state->inputs.data(), node_state->inputs.size());
+      default_construct_n(node_state->outputs.data(), node_state->outputs.size());
+      node_states_[i] = node_state;
     }
   }
 
-  void construct_initial_node_state(LinearAllocator<> &allocator,
-                                    const Node &node,
-                                    NodeState &node_state)
-  {
-    const Span<const InputSocket *> node_inputs = node.inputs();
-    const Span<const OutputSocket *> node_outputs = node.outputs();
-
-    node_state.inputs = allocator.construct_array<InputState>(node_inputs.size());
-    node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
-  }
-
-  void destruct_node_state(const Node &node, NodeState &node_state)
+  void destruct_node_data(const Node &node, NodeState &node_state, LocalAllocator &allocator)
   {
     if (node.is_function()) {
       const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
       if (node_state.storage != nullptr) {
-        fn.destruct_storage(node_state.storage);
+        fn.destruct_storage(node_state.storage, allocator);
       }
     }
     for (const int i : node.inputs().index_range()) {
       InputState &input_state = node_state.inputs[i];
       const InputSocket &input_socket = node.input(i);
-      this->destruct_input_value_if_exists(input_state, input_socket.type());
+      this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
     }
-    std::destroy_at(&node_state);
   }
 
   /**
@@ -453,7 +439,7 @@ class Executor {
           this->set_input_required(locked_node, socket);
         }
         else {
-          this->set_input_unused(locked_node, socket);
+          this->set_input_unused(locked_node, socket, this->get_local_allocator());
         }
       });
     }
@@ -500,13 +486,14 @@ class Executor {
    * Most importantly, this function initializes `InputState.usage` and
    * `OutputState.potential_target_sockets`.
    */
-  void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes)
+  void initialize_static_value_usages(const Span<const FunctionNode *> side_effect_nodes,
+                                      LocalAllocator &allocator)
   {
     const Span<const Node *> all_nodes = self_.graph_.nodes();
 
     /* Used for a search through all nodes that outputs depend on. */
-    Stack<const Node *> reachable_nodes_to_check;
-    Array<bool> reachable_node_flags(all_nodes.size(), false);
+    Stack<const Node *, 16, LocalAllocatorRef> reachable_nodes_to_check{allocator};
+    Array<bool, 16, LocalAllocatorRef> reachable_node_flags{all_nodes.size(), false, allocator};
 
     /* Graph outputs are always reachable. */
     for (const InputSocket *socket : self_.graph_outputs_) {
@@ -586,7 +573,7 @@ class Executor {
 
   void forward_newly_provided_inputs(CurrentTask &current_task)
   {
-    LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+    LocalAllocator &allocator = this->get_local_allocator();
     for (const int graph_input_index : self_.graph_inputs_.index_range()) {
       std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
       if (was_loaded.load()) {
@@ -605,7 +592,7 @@ class Executor {
   }
 
   void forward_newly_provided_input(CurrentTask &current_task,
-                                    LinearAllocator<> &allocator,
+                                    LocalAllocator &allocator,
                                     const int graph_input_index,
                                     void *input_data)
   {
@@ -621,7 +608,6 @@ class Executor {
     const Node &node = socket.node();
     const int index_in_node = socket.index();
     NodeState &node_state = *node_states_[node.index_in_graph()];
-    OutputState &output_state = node_state.outputs[index_in_node];
 
     /* The notified output socket might be an input of the entire graph. In this case, notify the
      * caller that the input is required. */
@@ -640,12 +626,13 @@ class Executor {
         return;
       }
       this->forward_newly_provided_input(
-          current_task, this->get_main_or_local_allocator(), graph_input_index, input_data);
+          current_task, this->get_local_allocator(), graph_input_index, input_data);
       return;
     }
 
     BLI_assert(node.is_function());
     this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
+      OutputState &output_state = node_state.outputs[index_in_node];
       if (output_state.usage == ValueUsage::Used) {
         return;
       }
@@ -659,9 +646,9 @@ class Executor {
     const Node &node = socket.node();
     const int index_in_node = socket.index();
     NodeState &node_state = *node_states_[node.index_in_graph()];
-    OutputState &output_state = node_state.outputs[index_in_node];
 
     this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
+      OutputState &output_state = node_state.outputs[index_in_node];
       output_state.potential_target_sockets -= 1;
       if (output_state.potential_target_sockets == 0) {
         BLI_assert(output_state.usage != ValueUsage::Unused);
@@ -760,7 +747,7 @@ class Executor {
   void run_node_task(const FunctionNode &node, CurrentTask &current_task)
   {
     NodeState &node_state = *node_states_[node.index_in_graph()];
-    LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+    LocalAllocator &allocator = this->get_local_allocator();
     const LazyFunction &fn = node.function();
 
     bool node_needs_execution = false;
@@ -799,6 +786,7 @@ class Executor {
         node_state.always_used_inputs_requested = true;
       }
 
+      const bool allow_missing_requested_inputs = fn.allow_missing_requested_inputs();
       for (const int input_index : node_state.inputs.index_range()) {
         InputState &input_state = node_state.inputs[input_index];
         if (input_state.was_ready_for_execution) {
@@ -808,7 +796,11 @@ class Executor {
           input_state.was_ready_for_execution = true;
           continue;
         }
-        if (!fn.allow_missing_requested_inputs()) {
+        const InputSocket &socket = node.input(input_index);
+        if (socket.origin() == nullptr) {
+          continue;
+        }
+        if (!allow_missing_requested_inputs) {
           if (input_state.usage == ValueUsage::Used) {
             return;
           }
@@ -848,7 +840,7 @@ class Executor {
       /* Importantly, the node must not be locked when it is executed. That would result in locks
        * being hold very long in some cases and results in multiple locks being hold by the same
        * thread in the same graph which can lead to deadlocks. */
-      this->execute_node(node, node_state, current_task);
+      this->execute_node(node, node_state, current_task, allocator);
     }
 
     this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
@@ -857,7 +849,7 @@ class Executor {
         this->assert_expected_outputs_have_been_computed(locked_node);
       }
 #endif
-      this->finish_node_if_possible(locked_node);
+      this->finish_node_if_possible(locked_node, allocator);
       const bool reschedule_requested = node_state.schedule_state ==
                                         NodeScheduleState::RunningAndRescheduled;
       node_state.schedule_state = NodeScheduleState::NotScheduled;
@@ -895,7 +887,7 @@ class Executor {
     }
   }
 
-  void finish_node_if_possible(LockedNode &locked_node)
+  void finish_node_if_possible(LockedNode &locked_node, LocalAllocator &allocator)
   {
     const Node &node = locked_node.node;
     NodeState &node_state = locked_node.node_state;
@@ -923,44 +915,44 @@ class Executor {
       const InputSocket &input_socket = node.input(input_index);
       InputState &input_state = node_state.inputs[input_index];
       if (input_state.usage == ValueUsage::Maybe) {
-        this->set_input_unused(locked_node, input_socket);
-      }
-      else if (input_state.usage == ValueUsage::Used) {
-        this->destruct_input_value_if_exists(input_state, input_socket.type());
+        this->set_input_unused(locked_node, input_socket, allocator);
       }
     }
 
-    if (node_state.storage != nullptr) {
-      if (node.is_function()) {
-        const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
-        fn_node.function().destruct_storage(node_state.storage);
-      }
-      node_state.storage = nullptr;
-    }
+    this->destruct_node_data(node, node_state, allocator);
   }
 
-  void destruct_input_value_if_exists(InputState &input_state, const CPPType &type)
+  void destruct_input_value_if_exists(InputState &input_state,
+                                      const CPPType &type,
+                                      LocalAllocator &allocator)
   {
     if (input_state.value != nullptr) {
       type.destruct(input_state.value);
+      allocator.deallocate(input_state.value, type.size(), type.alignment());
       input_state.value = nullptr;
     }
   }
 
-  void execute_node(const FunctionNode &node, NodeState &node_state, CurrentTask &current_task);
+  void execute_node(const FunctionNode &node,
+                    NodeState &node_state,
+                    CurrentTask &current_task,
+                    LocalAllocator &allocator);
 
   void set_input_unused_during_execution(const Node &node,
                                          NodeState &node_state,
                                          const int input_index,
                                          CurrentTask &current_task)
   {
+    LocalAllocator &allocator = this->get_local_allocator();
     const InputSocket &input_socket = node.input(input_index);
     this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
-      this->set_input_unused(locked_node, input_socket);
+      this->set_input_unused(locked_node, input_socket, allocator);
     });
   }
 
-  void set_input_unused(LockedNode &locked_node, const InputSocket &input_socket)
+  void set_input_unused(LockedNode &locked_node,
+                        const InputSocket &input_socket,
+                        LocalAllocator &allocator)
   {
     NodeState &node_state = locked_node.node_state;
     const int input_index = input_socket.index();
@@ -972,7 +964,7 @@ class Executor {
     }
     input_state.usage = ValueUsage::Unused;
 
-    this->destruct_input_value_if_exists(input_state, input_socket.type());
+    this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
     if (input_state.was_ready_for_execution) {
       return;
     }
@@ -1026,7 +1018,7 @@ class Executor {
                                       CurrentTask &current_task)
   {
     BLI_assert(value_to_forward.get() != nullptr);
-    LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+    LocalAllocator &allocator = this->get_local_allocator();
     const CPPType &type = *value_to_forward.type();
 
     if (self_.logger_ != nullptr) {
@@ -1038,17 +1030,7 @@ class Executor {
       const Node &target_node = target_socket->node();
       NodeState &node_state = *node_states_[target_node.index_in_graph()];
       const int input_index = target_socket->index();
-      InputState &input_state = node_state.inputs[input_index];
       const bool is_last_target = target_socket == targets.last();
-#ifdef DEBUG
-      if (input_state.value != nullptr) {
-        if (self_.logger_ != nullptr) {
-          self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
-        }
-        BLI_assert_unreachable();
-      }
-#endif
-      BLI_assert(!input_state.was_ready_for_execution);
       BLI_assert(target_socket->type() == type);
       BLI_assert(target_socket->origin() == &from_socket);
 
@@ -1072,6 +1054,18 @@ class Executor {
         continue;
       }
       this->with_locked_node(target_node, node_state, current_task, [&](LockedNode &locked_node) {
+        InputState &input_state = node_state.inputs[input_index];
+
+#ifdef DEBUG
+        if (input_state.value != nullptr) {
+          if (self_.logger_ != nullptr) {
+            self_.logger_->dump_when_input_is_set_twice(*target_socket, from_socket, *context_);
+          }
+          BLI_assert_unreachable();
+        }
+#endif
+        BLI_assert(!input_state.was_ready_for_execution);
+
         if (input_state.usage == ValueUsage::Unused) {
           return;
         }
@@ -1089,6 +1083,7 @@ class Executor {
     }
     if (value_to_forward.get() != nullptr) {
       value_to_forward.destruct();
+      allocator.deallocate(value_to_forward.get(), type.size(), type.alignment());
     }
   }
 
@@ -1145,23 +1140,10 @@ class Executor {
     if (BLI_system_thread_count() <= 1) {
       return false;
     }
-    this->ensure_thread_locals();
     task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
     return true;
   }
 
-  void ensure_thread_locals()
-  {
-#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
-    if (current_main_thread_ != std::this_thread::get_id()) {
-      BLI_assert_unreachable();
-    }
-#endif
-    if (!thread_locals_) {
-      thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
-    }
-  }
-
   /**
    * Allow other threads to steal all the nodes that are currently scheduled on this thread.
    */
@@ -1194,12 +1176,12 @@ class Executor {
         [](TaskPool * /*pool*/, void *data) { MEM_delete(static_cast<ScheduledNodes *>(data)); });
   }
 
-  LinearAllocator<> &get_main_or_local_allocator()
+  LocalAllocator &get_local_allocator()
   {
     if (this->use_multi_threading()) {
-      return thread_locals_->local().allocator;
+      return context_->allocator->local();
     }
-    return main_allocator_;
+    return *context_->allocator;
   }
 };
 
@@ -1248,7 +1230,7 @@ class GraphExecutorLFParams final : public Params {
     OutputState &output_state = node_state_.outputs[index];
     BLI_assert(!output_state.has_been_computed);
     if (output_state.value == nullptr) {
-      LinearAllocator<> &allocator = executor_.get_main_or_local_allocator();
+      LocalAllocator &allocator = executor_.get_local_allocator();
       const CPPType &type = node_.output(index).type();
       output_state.value = allocator.allocate(type.size(), type.alignment());
     }
@@ -1297,13 +1279,15 @@ class GraphExecutorLFParams final : public Params {
  */
 inline void Executor::execute_node(const FunctionNode &node,
                                    NodeState &node_state,
-                                   CurrentTask &current_task)
+                                   CurrentTask &current_task,
+                                   LocalAllocator &allocator)
 {
   const LazyFunction &fn = node.function();
   GraphExecutorLFParams node_params{fn, *this, node, node_state, current_task};
   BLI_assert(context_ != nullptr);
   Context fn_context = *context_;
   fn_context.storage = node_state.storage;
+  fn_context.allocator = &allocator;
 
   if (self_.logger_ != nullptr) {
     self_.logger_->log_before_node_execute(node, node_params, fn_context);
@@ -1330,12 +1314,32 @@ inline void Executor::execute_node(const FunctionNode &node,
   }
 }
 
+void GraphExecutor::preprocess(const Graph &graph, PreprocessData &r_preprocess_data)
+{
+  const Span<const Node *> nodes = graph.nodes();
+  r_preprocess_data.offsets.reinitialize(nodes.size());
+  int offset = 0;
+  for (const int i : nodes.index_range()) {
+    const Node &node = *nodes[i];
+    NodeBufferOffsets &node_offsets = r_preprocess_data.offsets[i];
+    node_offsets.node = offset;
+    offset += sizeof(NodeState);
+    node_offsets.inputs = offset;
+    offset += sizeof(InputState) * node.inputs().size();
+    node_offsets.outputs = offset;
+    offset += sizeof(OutputState) * node.outputs().size();
+  }
+  r_preprocess_data.node_state_buffer_size = offset;
+}
+
 GraphExecutor::GraphExecutor(const Graph &graph,
                              const Span<const OutputSocket *> graph_inputs,
                              const Span<const InputSocket *> graph_outputs,
+                             const PreprocessData &preprocess_data,
                              const Logger *logger,
                              const SideEffectProvider *side_effect_provider)
     : graph_(graph),
+      preprocess_data_(preprocess_data),
       graph_inputs_(graph_inputs),
       graph_outputs_(graph_outputs),
       logger_(logger),
@@ -1360,15 +1364,17 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
   executor.execute(params, context);
 }
 
-void *GraphExecutor::init_storage(LinearAllocator<> &allocator) const
+void *GraphExecutor::init_storage(LocalAllocator &allocator) const
 {
-  Executor &executor = *allocator.construct<Executor>(*this).release();
+  Executor &executor = allocator.allocate_new<Executor>(*this);
   return &executor;
 }
 
-void GraphExecutor::destruct_storage(void *storage) const
+void GraphExecutor::destruct_storage(void *storage, LocalAllocator &allocator) const
 {
-  std::destroy_at(static_cast<Executor *>(storage));
+  Executor *executor = static_cast<Executor *>(storage);
+  executor->destruct_state(allocator);
+  allocator.destruct_free(executor);
 }
 
 void GraphExecutorLogger::log_socket_value(const Socket &socket,
diff --git a/source/blender/functions/tests/FN_lazy_function_test.cc b/source/blender/functions/tests/FN_lazy_function_test.cc
index 54e1df00cdf..9776a6b4d77 100644
--- a/source/blender/functions/tests/FN_lazy_function_test.cc
+++ b/source/blender/functions/tests/FN_lazy_function_test.cc
@@ -105,7 +105,11 @@ TEST(lazy_function, SideEffects)
 
   SimpleSideEffectProvider side_effect_provider{{&store_node}};
 
-  GraphExecutor executor_fn{graph, {&input_node.output(0)}, {}, nullptr, &side_effect_provider};
+  GraphExecutor::PreprocessData preprocess_data;
+  GraphExecutor::preprocess(graph, preprocess_data);
+
+  GraphExecutor executor_fn{
+      graph, {&input_node.output(0)}, {}, preprocess_data, nullptr, &side_effect_provider};
   execute_lazy_function_eagerly(executor_fn, nullptr, std::make_tuple(5), std::make_tuple());
 
   EXPECT_EQ(dst1, 15);
@@ -167,8 +171,11 @@ TEST(lazy_function, GraphWithCycle)
 
   graph.update_node_indices();
 
+  GraphExecutor::PreprocessData preprocess_data;
+  GraphExecutor::preprocess(graph, preprocess_data);
+
   GraphExecutor executor_fn{
-      graph, {&input_node.output(0)}, {&output_node.input(0)}, nullptr, nullptr};
+      graph, {&input_node.output(0)}, {&output_node.input(0)}, preprocess_data, nullptr, nullptr};
   int result = 0;
   execute_lazy_function_eagerly(
       executor_fn, nullptr, std::make_tuple(10), std::make_tuple(&result));
diff --git a/source/blender/modifiers/intern/MOD_nodes.cc b/source/blender/modifiers/intern/MOD_nodes.cc
index 15aab38bed7..d8900a05bc1 100644
--- a/source/blender/modifiers/intern/MOD_nodes.cc
+++ b/source/blender/modifiers/intern/MOD_nodes.cc
@@ -1146,8 +1146,12 @@ static GeometrySet compute_geometry(
   blender::nodes::GeometryNodesLazyFunctionLogger lf_logger(lf_graph_info);
   blender::nodes::GeometryNodesLazyFunctionSideEffectProvider lf_side_effect_provider;
 
-  lf::GraphExecutor graph_executor{
-      lf_graph_info.graph, graph_inputs, graph_outputs, &lf_logger, &lf_side_effect_provider};
+  lf::GraphExecutor graph_executor{lf_graph_info.graph,
+                                   graph_inputs,
+                                   graph_outputs,
+                                   lf_graph_info.graph_preprocess_data,
+                                   &lf_logger,
+                                   &lf_side_effect_provider};
 
   blender::nodes::GeoNodesModifierData geo_nodes_modifier_data;
   geo_nodes_modifier_data.depsgraph = ctx->depsgraph;
@@ -1169,7 +1173,9 @@ static GeometrySet compute_geometry(
   blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
   user_data.compute_context = &modifier_compute_context;
 
-  blender::LinearAllocator<> allocator;
+  blender::LocalAllocatorSet allocator_set;
+  blender::LocalAllocator &allocator = allocator_set.local();
+
   Vector<GMutablePointer> inputs_to_destruct;
 
   int input_index = -1;
@@ -1212,6 +1218,7 @@ static GeometrySet compute_geometry(
   lf::Context lf_context;
   lf_context.storage = graph_executor.init_storage(allocator);
   lf_context.user_data = &user_data;
+  lf_context.allocator = &allocator;
   lf::BasicParams lf_params{graph_executor,
                             param_inputs,
                             param_outputs,
@@ -1219,7 +1226,7 @@ static GeometrySet compute_geometry(
                             param_output_usages,
                             param_set_outputs};
   graph_executor.execute(lf_params, lf_context);
-  graph_executor.destruct_storage(lf_context.storage);
+  graph_executor.destruct_storage(lf_context.storage, allocator);
 
   for (GMutablePointer &ptr : inputs_to_destruct) {
     ptr.destruct();
@@ -1289,6 +1296,7 @@ static void modifyGeometry(ModifierData *md,
                            const ModifierEvalContext *ctx,
                            GeometrySet &geometry_set)
 {
+  SCOPED_TIMER_AVERAGED(__func__);
   NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
   if (nmd->node_group == nullptr) {
     return;
diff --git a/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh b/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh
index 793798c4974..159db3bd477 100644
--- a/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh
+++ b/source/blender/nodes/NOD_geometry_nodes_lazy_function.hh
@@ -187,6 +187,7 @@ struct GeometryNodesLazyFunctionGraphInfo {
    * Mappings between the lazy-function graph and the #bNodeTree.
    */
   GeometryNodeLazyFunctionGraphMapping mapping;
+  lf::GraphExecutor::PreprocessData graph_preprocess_data;
   /**
    * Approximate number of nodes in the graph if all sub-graphs were inlined.
    * This can be used as a simple heuristic for the complexity of the node group.
diff --git a/source/blender/nodes/intern/geometry_nodes_lazy_function.cc b/source/blender/nodes/intern/geometry_nodes_lazy_function.cc
index 1a7f1520b12..ad4a3d0168d 100644
--- a/source/blender/nodes/intern/geometry_nodes_lazy_function.cc
+++ b/source/blender/nodes/intern/geometry_nodes_lazy_function.cc
@@ -769,6 +769,7 @@ class LazyFunctionForGroupNode : public LazyFunction {
     graph_executor_.emplace(lf_graph_info.graph,
                             std::move(graph_inputs),
                             std::move(graph_outputs),
+                            lf_graph_info.graph_preprocess_data,
                             &*lf_logger_,
                             &*lf_side_effect_provider_);
   }
@@ -805,18 +806,18 @@ class LazyFunctionForGroupNode : public LazyFunction {
     graph_executor_->execute(params, group_context);
   }
 
-  void *init_storage(LinearAllocator<> &allocator) const override
+  void *init_storage(LocalAllocator &allocator) const override
   {
-    Storage *s = allocator.construct<Storage>().release();
-    s->graph_executor_storage = graph_executor_->init_storage(allocator);
-    return s;
+    Storage &s = allocator.allocate_new<Storage>();
+    s.graph_executor_storage = graph_executor_->init_storage(allocator);
+    return &s;
   }
 
-  void destruct_storage(void *storage) const override
+  void destruct_storage(void *storage, LocalAllocator &allocator) const override
   {
     Storage *s = static_cast<Storage *>(storage);
-    graph_executor_->destruct_storage(s->graph_executor_storage);
-    std::destroy_at(s);
+    graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
+    allocator.destruct_free(s);
   }
 
   std::string name() const override
@@ -1243,6 +1244,7 @@ struct GeometryNodesLazyFunctionGraphBuilder {
 
     lf_graph_->update_node_indices();
     lf_graph_info_->num_inline_nodes_approximate += lf_graph_->nodes().size();
+    lf::GraphExecutor::preprocess(*lf_graph_, lf_graph_info_->graph_preprocess_data);
   }
 
  private: