Geometry Nodes: avoid using enumerable thread specific on single thread

The geometry nodes evaluator supports "lazy threading": it starts out
single-threaded and only switches to multi-threaded mode once it determines
that multi-threading is likely to be beneficial.

Now it only creates an EnumerableThreadSpecific when it actually uses
multiple threads. This results in a 6% speedup in my test file with many
node groups and math nodes.
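
The pattern behind this change can be shown in isolation: the per-thread container is held behind a std::unique_ptr and only constructed once the executor commits to multi-threading, while the single-threaded path keeps using a plain member and never touches thread-local storage. Below is a minimal standalone sketch of that idea; the Allocator, ThreadLocalStore and Executor names are illustrative stand-ins, not Blender's threading::EnumerableThreadSpecific or LinearAllocator<> API.

  #include <memory>
  #include <mutex>
  #include <thread>
  #include <unordered_map>
  #include <vector>

  /* Stand-in for a linear allocator. */
  struct Allocator {
    std::vector<std::unique_ptr<int>> blocks;
  };

  /* Simplified stand-in for an enumerable-thread-specific container:
   * one Allocator per thread, created on first access. */
  class ThreadLocalStore {
   public:
    Allocator &local()
    {
      std::lock_guard<std::mutex> lock(mutex_);
      return map_[std::this_thread::get_id()];
    }

   private:
    std::mutex mutex_;
    std::unordered_map<std::thread::id, Allocator> map_;
  };

  class Executor {
   public:
    Allocator &get_main_or_local_allocator()
    {
      if (use_multi_threading_) {
        /* Only reached after enable_multi_threading() built the container. */
        return thread_locals_->local();
      }
      /* Single-threaded fast path: no thread-local lookup at all. */
      return main_allocator_;
    }

    void enable_multi_threading()
    {
      /* Build the per-thread container lazily, right before it is needed. */
      if (!thread_locals_) {
        thread_locals_ = std::make_unique<ThreadLocalStore>();
      }
      use_multi_threading_ = true;
    }

   private:
    bool use_multi_threading_ = false;
    std::unique_ptr<ThreadLocalStore> thread_locals_;
    Allocator main_allocator_;
  };

  int main()
  {
    Executor executor;
    executor.get_main_or_local_allocator();  /* uses main_allocator_, nothing extra allocated */
    executor.enable_multi_threading();
    executor.get_main_or_local_allocator();  /* now goes through the per-thread store */
  }

The stand-in ThreadLocalStore uses a mutex-protected map only to keep the sketch self-contained; the relevant point is that it is never even allocated while the evaluator stays single-threaded.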
commit dba2d82846
parent c744d5453f
Date: 2022-12-29 21:05:41 +01:00

@@ -245,8 +245,11 @@ class Executor {
    * A separate linear allocator for every thread. We could potentially reuse some memory, but that
    * doesn't seem worth it yet.
    */
-  threading::EnumerableThreadSpecific<LinearAllocator<>> local_allocators_;
-  LinearAllocator<> *main_local_allocator_ = nullptr;
+  struct ThreadLocalData {
+    LinearAllocator<> allocator;
+  };
+  std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
+  LinearAllocator<> main_allocator_;
   /**
    * Set to false when the first execution ends.
    */
@@ -259,7 +262,6 @@ class Executor {
   {
     /* The indices are necessary, because they are used as keys in #node_states_. */
     BLI_assert(self_.graph_.node_indices_are_valid());
-    main_local_allocator_ = &local_allocators_.local();
   }
 
   ~Executor()
@@ -338,17 +340,26 @@ class Executor {
     Span<const Node *> nodes = self_.graph_.nodes();
     node_states_.reinitialize(nodes.size());
 
-    /* Construct all node states in parallel. */
-    threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
-      LinearAllocator<> &allocator = local_allocators_.local();
+    auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
       for (const int i : range) {
         const Node &node = *nodes[i];
         NodeState &node_state = *allocator.construct<NodeState>().release();
         node_states_[i] = &node_state;
         this->construct_initial_node_state(allocator, node, node_state);
       }
-    });
+    };
+    if (nodes.size() <= 256) {
+      construct_node_range(nodes.index_range(), main_allocator_);
+    }
+    else {
+      this->ensure_thread_locals();
+      /* Construct all node states in parallel. */
+      threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
+        LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+        construct_node_range(range, allocator);
+      });
+    }
   }
 
   void construct_initial_node_state(LinearAllocator<> &allocator,
                                     const Node &node,
@@ -1067,10 +1078,23 @@ class Executor {
if (BLI_system_thread_count() <= 1) { if (BLI_system_thread_count() <= 1) {
return false; return false;
} }
this->ensure_thread_locals();
task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH)); task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
return true; return true;
} }
void ensure_thread_locals()
{
#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
if (current_main_thread_ != std::this_thread::get_id()) {
BLI_assert_unreachable();
}
#endif
if (!thread_locals_) {
thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
}
}
/** /**
* Allow other threads to steal all the nodes that are currently scheduled on this thread. * Allow other threads to steal all the nodes that are currently scheduled on this thread.
*/ */
@@ -1109,9 +1133,9 @@ class Executor {
   LinearAllocator<> &get_main_or_local_allocator()
   {
     if (this->use_multi_threading()) {
-      return local_allocators_.local();
+      return thread_locals_->local().allocator;
     }
-    return *main_local_allocator_;
+    return main_allocator_;
   }
 };
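
The FN_LAZY_FUNCTION_DEBUG_THREADS block in ensure_thread_locals() is also worth noting: instead of locking, the lazy creation relies on the invariant that only the main thread ever builds the container, and debug builds turn that assumption into an assert. A generic version of that guard might look like the following sketch (standalone, with made-up names such as DEBUG_THREADS and LazyState; it is not Blender code):

  #include <cassert>
  #include <thread>

  /* Lazily initialized state that may only be built by the thread
   * that constructed the owning object. */
  class LazyState {
   public:
    LazyState() : owner_thread_(std::this_thread::get_id()) {}

    void ensure_initialized()
    {
  #ifdef DEBUG_THREADS
      /* In debug builds, verify the single-initializer-thread invariant
       * instead of paying for a lock in release builds. */
      assert(owner_thread_ == std::this_thread::get_id());
  #endif
      if (!initialized_) {
        initialized_ = true;  /* real code would build the expensive state here */
      }
    }

   private:
    std::thread::id owner_thread_;
    bool initialized_ = false;
  };

  int main()
  {
    LazyState state;
    state.ensure_initialized();  /* fine: called from the creating thread */
  }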