WIP: Functions: new local allocator for better memory reuse and performance #104630

Draft
Jacques Lucke wants to merge 44 commits from JacquesLucke/blender:local-allocator into main

9 changed files with 76 additions and 91 deletions
Showing only changes of commit 4d34d6716a

View File

@@ -5,6 +5,7 @@
#include "BLI_allocator.hh"
#include "BLI_asan.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_map.hh"
#include "BLI_math_bits.h"
#include "BLI_stack.hh"
@@ -13,14 +14,10 @@
namespace blender {
class LocalPoolScope {
};
template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, NonMovable {
private:
static constexpr int64_t s_alignment = 64;
const LocalPoolScope &pool_scope_;
Vector<MutableSpan<std::byte>> owned_buffers_;
struct BufferStack {
@@ -34,7 +31,7 @@ template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, N
BLI_NO_UNIQUE_ADDRESS Allocator allocator_;
public:
LocalPool(const LocalPoolScope &pool_scope) : pool_scope_(pool_scope)
LocalPool()
{
for (const int64_t i : IndexRange(small_stacks_.size())) {
small_stacks_[i].element_size = 8 * (i + 1);
@@ -129,4 +126,20 @@ template<typename Allocator = GuardedAllocator> class LocalPool : NonCopyable, N
}
};
class LocalMemoryPools {
private:
threading::EnumerableThreadSpecific<LocalPool<>> pool_by_thread_;
public:
LocalPool<> &local()
{
return pool_by_thread_.local();
}
};
struct Pools {
LocalMemoryPools *pools = nullptr;
LocalPool<> *local = nullptr;
};
} // namespace blender
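
A minimal usage sketch of the types added above (illustrative only, not part of the diff; the wrapper function name is invented and only members visible in this excerpt are used):

/* LocalMemoryPools owns one LocalPool per thread; Pools bundles that shared
 * owner with the pool of the current thread, so callees can allocate without
 * a thread-local lookup. */
static void local_pool_usage_sketch()
{
  blender::LocalMemoryPools local_pools;
  blender::Pools pools{&local_pools, &local_pools.local()};

  /* Allocations are served from the calling thread's pool; the backing
   * buffers stay owned by the pool and are freed when it is destructed. */
  void *buffer = pools.local->allocate(256, 8);
  pools.local->deallocate(buffer, 256, 8);
}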

View File

@@ -9,8 +9,7 @@ namespace blender::tests {
TEST(local_pool, Test)
{
LocalPoolScope pool_scope;
LocalPool pool(pool_scope);
LocalPool pool;
std::cout << pool.allocate(30000, 8) << "\n";
}
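
An illustrative extension of the test above (not part of the diff), assuming the same includes and namespace; construct() is assumed to hand back an owning handle with release(), matching how the graph executor below uses it:

static void local_pool_construct_sketch()
{
  LocalPool<> pool;
  /* Raw allocation: 30000 bytes with 8-byte alignment, as in the test. */
  void *raw = pool.allocate(30000, 8);
  /* Typed construction from the pool; release() detaches the object so its
   * lifetime is managed manually, as the executor does for node states. */
  int *value = pool.construct<int>(42).release();
  /* Returning memory to the pool makes it available for reuse. */
  pool.deallocate(value, sizeof(int), alignof(int));
  pool.deallocate(raw, 30000, 8);
}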

View File

@@ -100,7 +100,7 @@ struct Context {
*/
UserData *user_data;
LocalPool<> *local_pool = nullptr;
Pools pools;
};
/**
@@ -279,12 +279,12 @@ class LazyFunction {
* Allocates storage for this function. The storage will be passed to every call to #execute.
* If the function does not keep track of any state, this does not have to be implemented.
*/
virtual void *init_storage(LocalPool<> &allocator) const;
virtual void *init_storage(Pools &pools) const;
/**
* Destruct the storage created in #init_storage.
*/
virtual void destruct_storage(void *storage, LocalPool<> &allocator) const;
virtual void destruct_storage(void *storage, Pools &pools) const;
/**
* Calls `fn` with the input indices that the given `output_index` may depend on. By default
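
To make the new signatures concrete, a hypothetical function with per-execution storage could now look as follows (sketch only: the class name and storage contents are invented; the allocation pattern mirrors the group-node code further down in this diff):

class ExampleLazyFunction : public LazyFunction {
 private:
  struct Storage {
    int executed_count = 0;
  };

 public:
  void *init_storage(Pools &pools) const override
  {
    /* Storage is now taken from the pool carried in #Pools. */
    return pools.local->construct<Storage>().release();
  }

  void destruct_storage(void *storage, Pools &pools) const override
  {
    Storage *s = static_cast<Storage *>(storage);
    std::destroy_at(s);
    /* Hand the memory back to the local pool for reuse. */
    pools.local->deallocate(s, sizeof(Storage), alignof(Storage));
  }

 private:
  void execute_impl(Params & /*params*/, const Context &context) const override
  {
    Storage &storage = *static_cast<Storage *>(context.storage);
    /* Inputs and outputs would still be read and written through the params;
     * only the storage allocation changed. */
    storage.executed_count++;
  }
};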

View File

@@ -85,15 +85,16 @@ inline void execute_lazy_function_eagerly_impl(
...);
output_usages.fill(ValueUsage::Used);
set_outputs.fill(false);
LocalPoolScope local_pool_scope;
LocalPool<> allocator(local_pool_scope);
LocalMemoryPools local_pools;
Pools pools{&local_pools, &local_pools.local()};
Context context;
context.user_data = user_data;
context.storage = fn.init_storage(allocator);
context.storage = fn.init_storage(pools);
context.pools = pools;
BasicParams params{
fn, input_pointers, output_pointers, input_usages, output_usages, set_outputs};
fn.execute(params, context);
fn.destruct_storage(context.storage, allocator);
fn.destruct_storage(context.storage, pools);
/* Make sure all outputs have been computed. */
BLI_assert(!Span<bool>(set_outputs).contains(false));

View File

@@ -88,8 +88,8 @@ class GraphExecutor : public LazyFunction {
const Logger *logger,
const SideEffectProvider *side_effect_provider);
void *init_storage(LocalPool<> &allocator) const override;
void destruct_storage(void *storage, LocalPool<> &allocator) const override;
void *init_storage(Pools &pools) const override;
void destruct_storage(void *storage, Pools &pools) const override;
private:
void execute_impl(Params &params, const Context &context) const override;

View File

@@ -25,12 +25,12 @@ std::string LazyFunction::output_name(int index) const
return outputs_[index].debug_name;
}
void *LazyFunction::init_storage(LocalPool<> & /*allocator*/) const
void *LazyFunction::init_storage(Pools & /*pools*/) const
{
return nullptr;
}
void LazyFunction::destruct_storage(void *storage, LocalPool<> & /*allocator*/) const
void LazyFunction::destruct_storage(void *storage, Pools & /*pools*/) const
{
BLI_assert(storage == nullptr);
UNUSED_VARS_NDEBUG(storage);

View File

@@ -247,16 +247,7 @@ class Executor {
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
std::thread::id current_main_thread_;
#endif
LocalPoolScope local_pool_scope_;
struct ThreadLocalData {
LocalPool<> local_pool;
ThreadLocalData(const LocalPoolScope &local_pool_scope) : local_pool(local_pool_scope)
{
}
};
std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
LocalPool<> main_allocator_;
/**
* Set to false when the first execution ends.
*/
@@ -265,14 +256,13 @@ class Executor {
friend GraphExecutorLFParams;
public:
Executor(const GraphExecutor &self)
: self_(self), loaded_inputs_(self.graph_inputs_.size()), main_allocator_(local_pool_scope_)
Executor(const GraphExecutor &self) : self_(self), loaded_inputs_(self.graph_inputs_.size())
{
/* The indices are necessary, because they are used as keys in #node_states_. */
BLI_assert(self_.graph_.node_indices_are_valid());
}
void destruct_self(LocalPool<> & /*parent_allocator*/)
void destruct_self(Pools &pools)
{
if (TaskPool *task_pool = task_pool_.load()) {
BLI_task_pool_free(task_pool);
@@ -281,7 +271,7 @@ class Executor {
for (const int node_index : range) {
const Node &node = *self_.graph_.nodes()[node_index];
NodeState &node_state = *node_states_[node_index];
this->destruct_node_state(node, node_state, this->get_main_or_local_allocator());
this->destruct_node_state(node, node_state, pools);
}
});
this->~Executor();
@@ -329,7 +319,7 @@ class Executor {
}
}
this->initialize_static_value_usages(side_effect_nodes, this->get_main_or_local_allocator());
this->initialize_static_value_usages(side_effect_nodes, this->get_local_allocator());
this->schedule_side_effect_nodes(side_effect_nodes, current_task);
}
@@ -349,25 +339,16 @@ class Executor {
Span<const Node *> nodes = self_.graph_.nodes();
node_states_.reinitialize(nodes.size());
auto construct_node_range = [&](const IndexRange range, LocalPool<> &allocator) {
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LocalPool<> &allocator = this->get_local_allocator();
for (const int i : range) {
const Node &node = *nodes[i];
NodeState &node_state = *allocator.construct<NodeState>().release();
node_states_[i] = &node_state;
this->construct_initial_node_state(allocator, node, node_state);
}
};
if (nodes.size() <= 256) {
construct_node_range(nodes.index_range(), main_allocator_);
}
else {
this->ensure_thread_locals();
/* Construct all node states in parallel. */
threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
LocalPool<> &allocator = thread_locals_->local().local_pool;
construct_node_range(range, allocator);
});
}
});
}
void construct_initial_node_state(LocalPool<> &allocator,
@@ -381,18 +362,18 @@ class Executor {
node_state.outputs = allocator.construct_array<OutputState>(node_outputs.size());
}
void destruct_node_state(const Node &node, NodeState &node_state, LocalPool<> &allocator)
void destruct_node_state(const Node &node, NodeState &node_state, Pools &pools)
{
if (node.is_function()) {
const LazyFunction &fn = static_cast<const FunctionNode &>(node).function();
if (node_state.storage != nullptr) {
fn.destruct_storage(node_state.storage, allocator);
fn.destruct_storage(node_state.storage, pools);
}
}
for (const int i : node.inputs().index_range()) {
InputState &input_state = node_state.inputs[i];
const InputSocket &input_socket = node.input(i);
this->destruct_input_value_if_exists(input_state, input_socket.type(), allocator);
this->destruct_input_value_if_exists(input_state, input_socket.type(), *pools.local);
}
std::destroy_at(&node_state);
}
@@ -548,7 +529,7 @@ class Executor {
void forward_newly_provided_inputs(CurrentTask &current_task)
{
LocalPool<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_local_allocator();
for (const int graph_input_index : self_.graph_inputs_.index_range()) {
std::atomic<uint8_t> &was_loaded = loaded_inputs_[graph_input_index];
if (was_loaded.load()) {
@@ -602,7 +583,7 @@ class Executor {
return;
}
this->forward_newly_provided_input(
current_task, this->get_main_or_local_allocator(), graph_input_index, input_data);
current_task, this->get_local_allocator(), graph_input_index, input_data);
return;
}
@@ -721,7 +702,7 @@ class Executor {
void run_node_task(const FunctionNode &node, CurrentTask &current_task)
{
NodeState &node_state = *node_states_[node.index_in_graph()];
LocalPool<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_local_allocator();
const LazyFunction &fn = node.function();
bool node_needs_execution = false;
@@ -787,7 +768,8 @@ class Executor {
if (node_needs_execution) {
if (!node_state.storage_and_defaults_initialized) {
/* Initialize storage. */
node_state.storage = fn.init_storage(allocator);
Pools pools{context_->pools.pools, &allocator};
node_state.storage = fn.init_storage(pools);
/* Load unlinked inputs. */
for (const int input_index : node.inputs().index_range()) {
@@ -899,7 +881,8 @@ class Executor {
if (node_state.storage != nullptr) {
if (node.is_function()) {
const FunctionNode &fn_node = static_cast<const FunctionNode &>(node);
fn_node.function().destruct_storage(node_state.storage, allocator);
Pools pools{context_->pools.pools, &allocator};
fn_node.function().destruct_storage(node_state.storage, pools);
}
node_state.storage = nullptr;
}
@@ -926,7 +909,7 @@ class Executor {
const int input_index,
CurrentTask &current_task)
{
LocalPool<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_local_allocator();
const InputSocket &input_socket = node.input(input_index);
this->with_locked_node(node, node_state, current_task, [&](LockedNode &locked_node) {
this->set_input_unused(locked_node, input_socket, allocator);
@@ -1001,7 +984,7 @@ class Executor {
CurrentTask &current_task)
{
BLI_assert(value_to_forward.get() != nullptr);
LocalPool<> &allocator = this->get_main_or_local_allocator();
LocalPool<> &allocator = this->get_local_allocator();
const CPPType &type = *value_to_forward.type();
if (self_.logger_ != nullptr) {
@@ -1115,24 +1098,10 @@ class Executor {
if (BLI_system_thread_count() <= 1) {
return false;
}
this->ensure_thread_locals();
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
return true;
}
void ensure_thread_locals()
{
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
if (current_main_thread_ != std::this_thread::get_id()) {
BLI_assert_unreachable();
}
#endif
if (!thread_locals_) {
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>(
[scope = &local_pool_scope_]() { return ThreadLocalData{*scope}; });
}
}
/**
* Allow other threads to steal all the nodes that are currently scheduled on this thread.
*/
@@ -1168,15 +1137,12 @@ class Executor {
});
}
LocalPool<> &get_main_or_local_allocator()
LocalPool<> &get_local_allocator()
{
if (this->use_multi_threading()) {
return thread_locals_->local().local_pool;
return context_->pools.pools->local();
}
if (context_ != nullptr && context_->local_pool) {
return *context_->local_pool;
}
return main_allocator_;
return *context_->pools.local;
}
};
@@ -1225,7 +1191,7 @@ class GraphExecutorLFParams final : public Params {
OutputState &output_state = node_state_.outputs[index];
BLI_assert(!output_state.has_been_computed);
if (output_state.value == nullptr) {
LocalPool<> &allocator = executor_.get_main_or_local_allocator();
LocalPool<> &allocator = executor_.get_local_allocator();
const CPPType &type = node_.output(index).type();
output_state.value = allocator.allocate(type.size(), type.alignment());
}
@@ -1282,7 +1248,7 @@ inline void Executor::execute_node(const FunctionNode &node,
BLI_assert(context_ != nullptr);
Context fn_context = *context_;
fn_context.storage = node_state.storage;
fn_context.local_pool = &allocator;
fn_context.pools.local = &allocator;
if (self_.logger_ != nullptr) {
self_.logger_->log_before_node_execute(node, node_params, fn_context);
@@ -1339,17 +1305,17 @@ void GraphExecutor::execute_impl(Params &params, const Context &context) const
executor.execute(params, context);
}
void *GraphExecutor::init_storage(LocalPool<> &allocator) const
void *GraphExecutor::init_storage(Pools &pools) const
{
Executor &executor = *allocator.construct<Executor>(*this).release();
Executor &executor = *pools.local->construct<Executor>(*this).release();
return &executor;
}
void GraphExecutor::destruct_storage(void *storage, LocalPool<> &allocator) const
void GraphExecutor::destruct_storage(void *storage, Pools &pools) const
{
Executor *executor = static_cast<Executor *>(storage);
executor->destruct_self(allocator);
allocator.deallocate(executor, sizeof(Executor), alignof(Executor));
executor->destruct_self(pools);
pools.local->deallocate(executor, sizeof(Executor), alignof(Executor));
}
void GraphExecutorLogger::log_socket_value(const Socket &socket,
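
For reference, the allocator selection that replaces get_main_or_local_allocator() boils down to this standalone sketch (the helper name is invented; the behaviour matches Executor::get_local_allocator() above):

/* With multi-threading, each worker asks the shared LocalMemoryPools for its
 * own pool; otherwise the pool handed in by the caller through Pools is
 * reused directly. */
static blender::LocalPool<> &pick_local_pool(const bool use_multi_threading,
                                             const blender::Pools &pools)
{
  if (use_multi_threading) {
    return pools.pools->local();
  }
  return *pools.local;
}

When a node is handed to its LazyFunction, only the thread-local half is rebound: the Pools pools{context_->pools.pools, &allocator} lines above keep the shared owner and swap in the executing thread's pool.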

View File

@@ -1163,8 +1163,12 @@ static GeometrySet compute_geometry(
blender::bke::ModifierComputeContext modifier_compute_context{nullptr, nmd->modifier.name};
user_data.compute_context = &modifier_compute_context;
blender::LocalPoolScope local_pool_scope;
blender::LocalPool<> allocator(local_pool_scope);
blender::LocalMemoryPools local_pools;
blender::Pools pools;
pools.pools = &local_pools;
pools.local = &local_pools.local();
blender::LocalPool<> &allocator = *pools.local;
Vector<GMutablePointer> inputs_to_destruct;
int input_index;
@@ -1190,8 +1194,9 @@
}
lf::Context lf_context;
lf_context.storage = graph_executor.init_storage(allocator);
lf_context.storage = graph_executor.init_storage(pools);
lf_context.user_data = &user_data;
lf_context.pools = pools;
lf::BasicParams lf_params{graph_executor,
param_inputs,
param_outputs,
@ -1199,7 +1204,7 @@ static GeometrySet compute_geometry(
param_output_usages,
param_set_outputs};
graph_executor.execute(lf_params, lf_context);
graph_executor.destruct_storage(lf_context.storage, allocator);
graph_executor.destruct_storage(lf_context.storage, pools);
for (GMutablePointer &ptr : inputs_to_destruct) {
ptr.destruct();
@@ -1272,6 +1277,7 @@ static void modifyGeometry(ModifierData *md,
const ModifierEvalContext *ctx,
GeometrySet &geometry_set)
{
SCOPED_TIMER_AVERAGED(__func__);
NodesModifierData *nmd = reinterpret_cast<NodesModifierData *>(md);
if (nmd->node_group == nullptr) {
return;

View File

@@ -689,17 +689,17 @@ class LazyFunctionForGroupNode : public LazyFunction {
graph_executor_->execute(params, group_context);
}
void *init_storage(LocalPool<> &allocator) const override
void *init_storage(Pools &pools) const override
{
Storage *s = allocator.construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(allocator);
Storage *s = pools.local->construct<Storage>().release();
s->graph_executor_storage = graph_executor_->init_storage(pools);
return s;
}
void destruct_storage(void *storage, LocalPool<> &allocator) const override
void destruct_storage(void *storage, Pools &pools) const override
{
Storage *s = static_cast<Storage *>(storage);
graph_executor_->destruct_storage(s->graph_executor_storage, allocator);
graph_executor_->destruct_storage(s->graph_executor_storage, pools);
std::destroy_at(s);
}
};