diff --git a/source/blender/blenlib/BLI_task.hh b/source/blender/blenlib/BLI_task.hh index b566a76e542..47690ecfd25 100644 --- a/source/blender/blenlib/BLI_task.hh +++ b/source/blender/blenlib/BLI_task.hh @@ -73,6 +73,10 @@ void parallel_for_weighted_impl(IndexRange range, int64_t grain_size, FunctionRef function, FunctionRef)> task_sizes_fn); +void parallel_for_weighted_impl(IndexRange range, + int64_t grain_size, + FunctionRef function, + FunctionRef task_sizes_fn); void memory_bandwidth_bound_task_impl(FunctionRef function); } // namespace detail @@ -102,7 +106,12 @@ inline void parallel_for(IndexRange range, int64_t grain_size, const Function &f * * \param task_size_fn: Gets the task index as input and computes that tasks size. * \param grain_size: Determines approximately how large a combined task should be. For example, if - * the grain size is 100, then 5 tasks of size 20 fit into it. + * the grain size is 100, then 5 tasks of size 20 fit into it. Depending on the use case, + * the task size function can have one of two signatures: + * - `int64_t (int64_t element)` + * - `int64_t (IndexRange elements)` + * If the weight of each element is known as a prefix sum, it is better to retrieve the total + * weight of a whole range at once instead of computing the weight of each element separately. 
*/ template inline void parallel_for_weighted(IndexRange range, @@ -113,14 +122,27 @@ inline void parallel_for_weighted(IndexRange range, if (range.is_empty()) { return; } - detail::parallel_for_weighted_impl( - range, grain_size, function, [&](const IndexRange sub_range, MutableSpan r_sizes) { - for (const int64_t i : sub_range.index_range()) { - const int64_t task_size = task_size_fn(sub_range[i]); - BLI_assert(task_size >= 0); - r_sizes[i] = task_size; - } - }); + if constexpr (std::is_invocable_r_v) { + detail::parallel_for_weighted_impl( + range, + grain_size, + function, + [&](const IndexRange sub_range, MutableSpan r_sizes) { + for (const int64_t i : sub_range.index_range()) { + const int64_t task_size = task_size_fn(sub_range[i]); + BLI_assert(task_size >= 0); + r_sizes[i] = task_size; + } + }); + } + else { + detail::parallel_for_weighted_impl( + range, grain_size, function, [&](const IndexRange sub_range) -> int64_t { + const int64_t range_total = task_size_fn(sub_range); + BLI_assert(range_total >= 0); + return range_total; + }); + } } /** diff --git a/source/blender/blenlib/intern/task_range.cc b/source/blender/blenlib/intern/task_range.cc index 569165a7b33..4c50fa1d965 100644 --- a/source/blender/blenlib/intern/task_range.cc +++ b/source/blender/blenlib/intern/task_range.cc @@ -8,11 +8,14 @@ * Task parallel range functions. 
*/ +#include + #include #include "MEM_guardedalloc.h" #include "BLI_array.hh" +#include "BLI_binary_search.hh" #include "BLI_lazy_threading.hh" #include "BLI_offset_indices.hh" #include "BLI_task.h" @@ -223,6 +226,37 @@ void parallel_for_weighted_impl( }); } +void parallel_for_weighted_impl(const IndexRange range, + const int64_t grain_size, + const FunctionRef function, + const FunctionRef task_sizes_fn) +{ + Vector offsets_vec; + offsets_vec.append(0); + + IndexRange next_range = range; + while (!next_range.is_empty()) { + const int64_t size_of_current_segment = binary_search::find_predicate_begin( + next_range.begin(), next_range.end(), [&](const int64_t i) { + const IndexRange slice = IndexRange::from_begin_end_inclusive(next_range.first(), i); + return task_sizes_fn(slice) > grain_size; + }); + offsets_vec.append(next_range[size_of_current_segment] + 1); + next_range = next_range.drop_front(size_of_current_segment + 1); + } + if (offsets_vec.last() < range.last()) { + offsets_vec.append(range.size()); + } + + const OffsetIndices offsets = offsets_vec.as_span(); + + threading::parallel_for(offsets.index_range(), 1, [&](const IndexRange offsets_range) { + for (const int64_t i : offsets_range) { + function(offsets[i]); + } + }); +} + void memory_bandwidth_bound_task_impl(const FunctionRef function) { #ifdef WITH_TBB diff --git a/source/blender/nodes/geometry/nodes/node_geo_sort_elements.cc b/source/blender/nodes/geometry/nodes/node_geo_sort_elements.cc index 7a62a505928..ad862bac523 100644 --- a/source/blender/nodes/geometry/nodes/node_geo_sort_elements.cc +++ b/source/blender/nodes/geometry/nodes/node_geo_sort_elements.cc @@ -2,8 +2,12 @@ * * SPDX-License-Identifier: GPL-2.0-or-later */ +#include + #include +#include "BLI_timeit.hh" + #include "BKE_attribute.hh" #include "BKE_instances.hh" @@ -32,6 +36,8 @@ static void node_declare(NodeDeclarationBuilder &b) b.add_input("Group ID").field_on_all().hide_value(); b.add_input("Sort 
Weight").field_on_all().hide_value(); + b.add_input("New"); + b.add_output("Geometry").propagate_all(); } @@ -45,10 +51,11 @@ static void node_init(bNodeTree * /*tree*/, bNode *node) node->custom1 = int(bke::AttrDomain::Point); } -static void grouped_sort(const OffsetIndices offsets, - const Span weights, - MutableSpan indices) +static void grouped_sort_by_element(const OffsetIndices offsets, + const Span weights, + MutableSpan indices) { + SCOPED_TIMER_AVERAGED(__func__); const auto comparator = [&](const int index_a, const int index_b) { const float weight_a = weights[index_a]; const float weight_b = weights[index_b]; @@ -59,12 +66,43 @@ static void grouped_sort(const OffsetIndices offsets, return weight_a < weight_b; }; - threading::parallel_for(offsets.index_range(), 250, [&](const IndexRange range) { - for (const int group_index : range) { - MutableSpan group = indices.slice(offsets[group_index]); - parallel_sort(group.begin(), group.end(), comparator); + threading::parallel_for_weighted( + offsets.index_range(), + 1024, + [&](const IndexRange range) { + for (const int group_index : range) { + MutableSpan group = indices.slice(offsets[group_index]); + parallel_sort(group.begin(), group.end(), comparator); + } + }, + [&](const int64_t element) -> int64_t { return offsets[element].size(); }); +} + +static void grouped_sort_by_range(const OffsetIndices offsets, + const Span weights, + MutableSpan indices) +{ + SCOPED_TIMER_AVERAGED(__func__); + const auto comparator = [&](const int index_a, const int index_b) { + const float weight_a = weights[index_a]; + const float weight_b = weights[index_b]; + if (UNLIKELY(weight_a == weight_b)) { + /* Approach to make it stable. 
*/ + return index_a < index_b; } - }); + return weight_a < weight_b; + }; + + threading::parallel_for_weighted( + offsets.index_range(), + 1024, + [&](const IndexRange range) { + for (const int group_index : range) { + MutableSpan group = indices.slice(offsets[group_index]); + parallel_sort(group.begin(), group.end(), comparator); + } + }, + [&](const IndexRange range) -> int64_t { return offsets[range].size(); }); } static void find_points_by_group_index(const Span indices, @@ -123,7 +161,8 @@ static std::optional> sorted_indices(const fn::FieldContext &field_co const int domain_size, const Field selection_field, const Field group_id_field, - const Field weight_field) + const Field weight_field, + const bool is_new) { if (domain_size == 0) { return std::nullopt; @@ -151,7 +190,12 @@ static std::optional> sorted_indices(const fn::FieldContext &field_co mask.to_indices(gathered_indices); Array weight_span(domain_size); array_utils::copy(weight, mask, weight_span.as_mutable_span()); - grouped_sort(Span({0, int(mask.size())}), weight_span, gathered_indices); + if (is_new) { + grouped_sort_by_range(Span({0, int(mask.size())}), weight_span, gathered_indices); + } + else { + grouped_sort_by_element(Span({0, int(mask.size())}), weight_span, gathered_indices); + } } else { Array gathered_group_id(mask.size()); @@ -162,7 +206,12 @@ static std::optional> sorted_indices(const fn::FieldContext &field_co if (!weight.is_single()) { Array weight_span(mask.size()); array_utils::gather(weight, mask, weight_span.as_mutable_span()); - grouped_sort(offsets_to_sort.as_span(), weight_span, gathered_indices); + if (is_new) { + grouped_sort_by_range(offsets_to_sort.as_span(), weight_span, gathered_indices); + } + else { + grouped_sort_by_element(offsets_to_sort.as_span(), weight_span, gathered_indices); + } } parallel_transform(gathered_indices, 2048, [&](const int pos) { return mask[pos]; }); } @@ -199,6 +248,8 @@ static void node_geo_exec(GeoNodeExecParams params) const Field 
weight_field = params.extract_input>("Sort Weight"); const bke::AttrDomain domain = bke::AttrDomain(params.node().custom1); + const bool is_new = params.extract_input("New"); + const bke::AnonymousAttributePropagationInfo propagation_info = params.get_output_propagation_info("Geometry"); @@ -213,7 +264,8 @@ static void node_geo_exec(GeoNodeExecParams params) instances->instances_num(), selection_field, group_id_field, - weight_field)) + weight_field, + is_new)) { bke::Instances *result = geometry::reorder_instaces( *instances, *indices, propagation_info); @@ -239,7 +291,8 @@ static void node_geo_exec(GeoNodeExecParams params) src_component->attribute_domain_size(domain), selection_field, group_id_field, - weight_field); + weight_field, + is_new); if (!indices.has_value()) { continue; }