BLI: avoid invoking tbb for small workloads
We often call `parallel_for` in places where the workload size varies widely. When many elements are processed, using multi-threading is great, but when only a few elements are processed (possibly many times), using `parallel_for` can result in significant overhead. I measured that this improves performance by >20% in the refactored realize-instances code I'm working on separately. The change might also help with debugging sometimes, because the stack trace is smaller and contains fewer irrelevant symbols.
This commit is contained in:
@@ -67,14 +67,19 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
|
||||
return;
|
||||
}
|
||||
#ifdef WITH_TBB
|
||||
tbb::parallel_for(tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
|
||||
[&](const tbb::blocked_range<int64_t> &subrange) {
|
||||
function(IndexRange(subrange.begin(), subrange.size()));
|
||||
});
|
||||
/* Invoking tbb for small workloads has a large overhead. */
|
||||
if (range.size() >= grain_size) {
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
|
||||
[&](const tbb::blocked_range<int64_t> &subrange) {
|
||||
function(IndexRange(subrange.begin(), subrange.size()));
|
||||
});
|
||||
return;
|
||||
}
|
||||
#else
|
||||
UNUSED_VARS(grain_size);
|
||||
function(range);
|
||||
#endif
|
||||
function(range);
|
||||
}
|
||||
|
||||
template<typename Value, typename Function, typename Reduction>
|
||||
|
Reference in New Issue
Block a user