BLI: refactor IndexMask for better performance and memory usage #104629

Merged
Jacques Lucke merged 254 commits from JacquesLucke/blender:index-mask-refactor into main 2023-05-24 18:11:47 +02:00
7 changed files with 80 additions and 89 deletions
Showing only changes of commit be9bbdb13b

View File

@@ -67,7 +67,7 @@ BLI_NOINLINE static void sample_corner_attribute(const Span<MLoopTri> looptris,
if constexpr (check_indices) {
if (looptri_indices[i] == -1) {
dst[i] = {};
continue;
return;
}
}
const MLoopTri &tri = looptris[looptri_indices[i]];
@@ -144,17 +144,17 @@ static void sample_barycentric_weights(const Span<float3> vert_positions,
const IndexMask &mask,
MutableSpan<float3> bary_coords)
{
for (const int i : mask) {
mask.foreach_index([&](const int i) {
if constexpr (check_indices) {
if (looptri_indices[i] == -1) {
bary_coords[i] = {};
continue;
return;
}
}
const MLoopTri &tri = looptris[looptri_indices[i]];
bary_coords[i] = compute_bary_coord_in_triangle(
vert_positions, corner_verts, tri, sample_positions[i]);
}
});
}
template<bool check_indices = false>
@@ -170,7 +170,7 @@ static void sample_nearest_weights(const Span<float3> vert_positions,
if constexpr (check_indices) {
if (looptri_indices[i] == -1) {
bary_coords[i] = {};
continue;
return;
}
}
const MLoopTri &tri = looptris[looptri_indices[i]];
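The change that repeats throughout this commit is mechanical: range-based `for` loops over an `IndexMask` become `foreach_index` calls taking a lambda, so every `continue` becomes a `return` and the loop's closing `}` becomes `});`. A minimal sketch of the migration, with `skip` and `process` as hypothetical stand-ins for the real loop bodies:

```cpp
// Before: range-based iteration; `continue` skips to the next index.
for (const int64_t i : mask) {
  if (skip(i)) {
    continue;
  }
  process(i);
}

// After: callback-based iteration; `return` exits only the lambda,
// taking over the role `continue` played in the loop body.
mask.foreach_index([&](const int64_t i) {
  if (skip(i)) {
    return;
  }
  process(i);
});
```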

View File

@@ -6,6 +6,7 @@
#include <optional>
#include <variant>
#include "BLI_array.hh"
#include "BLI_bit_vector.hh"
#include "BLI_function_ref.hh"
#include "BLI_index_range.hh"
@@ -201,6 +202,11 @@ class IndexMask {
GrainSize grain_size,
IndexMaskMemory &memory,
Fn &&predicate);
template<typename T, typename Fn>
static void from_groups(const IndexMask &universe,
IndexMaskMemory &memory,
Fn &&get_group_index,
MutableSpan<IndexMask> r_masks);
template<typename T> void to_indices(MutableSpan<T> r_indices) const;
void to_bits(MutableBitSpan r_bits, int64_t offset = 0) const;
@@ -696,7 +702,8 @@ template<typename Fn> inline void ChunkSlice::foreach_span(Fn &&fn) const
fn(indices);
}
for (int64_t segment_i = this->begin_it.segment_i + 1; segment_i < this->end_it.segment_i;
segment_i++) {
segment_i++)
{
const int64_t begin_i = 0;
const int64_t end_i = this->chunk->segment_size(segment_i);
const Span<int16_t> indices{this->chunk->indices_by_segment[segment_i] + begin_i,
@@ -773,8 +780,8 @@ template<typename Fn> inline void IndexMask::foreach_span_template(Fn &&fn) const
chunk.indices_by_segment[segment_i] + begin_it.index_in_segment, segment_size};
fn(chunk_id, indices);
}
for (int64_t segment_i = begin_it.segment_i + 1; segment_i < chunk.segments_num;
segment_i++) {
for (int64_t segment_i = begin_it.segment_i + 1; segment_i < chunk.segments_num; segment_i++)
{
const int64_t segment_size = chunk.cumulative_segment_sizes[segment_i + 1] -
chunk.cumulative_segment_sizes[segment_i];
const Span<int16_t> indices{chunk.indices_by_segment[segment_i], segment_size};
@@ -1016,6 +1023,22 @@ inline IndexMask IndexMask::from_predicate(const IndexMask &universe,
return IndexMask::from_indices<int64_t>(indices, memory);
}
template<typename T, typename Fn>
void IndexMask::from_groups(const IndexMask &universe,
IndexMaskMemory &memory,
Fn &&get_group_index,
MutableSpan<IndexMask> r_masks)
{
Array<Vector<T>> indices_by_group(r_masks.size());
universe.foreach_index([&](const int64_t i) {
const int group_index = get_group_index(i);
indices_by_group[group_index].append(T(i));
});
for (const int64_t i : r_masks.index_range()) {
r_masks[i] = IndexMask::from_indices<T>(indices_by_group[i], memory);
}
}
std::optional<IndexRange> inline IndexMask::to_range() const
{
if (data_.indices_num == 0) {
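Reading the definition above, `from_groups` is a bucketing helper: it walks the universe once, asks `get_group_index` which bucket each index belongs to, and materializes one `IndexMask` per bucket. A hedged usage sketch, with the even/odd split purely illustrative:

```cpp
IndexMaskMemory memory;
Array<IndexMask> masks(2);
/* Split indices 0..99 into two masks: even indices end up in masks[0],
 * odd indices in masks[1]. Every group index returned by the lambda must
 * be a valid index into the output span. */
IndexMask::from_groups<int>(
    IndexMask(100), memory, [](const int64_t i) { return int(i % 2); }, masks);
```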

View File

@@ -23,9 +23,7 @@ class AddPrefixFunction : public MultiFunction {
const VArray<std::string> &prefixes = params.readonly_single_input<std::string>(0, "Prefix");
MutableSpan<std::string> strings = params.single_mutable<std::string>(1, "Strings");
for (int64_t i : mask) {
strings[i] = prefixes[i] + strings[i];
}
mask.foreach_index([&](const int64_t i) { strings[i] = prefixes[i] + strings[i]; });
}
};
@@ -48,12 +46,12 @@ class CreateRangeFunction : public MultiFunction {
const VArray<int> &sizes = params.readonly_single_input<int>(0, "Size");
GVectorArray &ranges = params.vector_output(1, "Range");
for (int64_t i : mask) {
mask.foreach_index([&](const int64_t i) {
int size = sizes[i];
for (int j : IndexRange(size)) {
ranges.append(i, &j);
}
}
});
}
};
@@ -75,12 +73,12 @@ class GenericAppendFunction : public MultiFunction {
GVectorArray &vectors = params.vector_mutable(0, "Vector");
const GVArray &values = params.readonly_single_input(1, "Value");
for (int64_t i : mask) {
mask.foreach_index([&](const int64_t i) {
BUFFER_FOR_CPP_TYPE_VALUE(values.type(), buffer);
values.get(i, buffer);
vectors.append(i, buffer);
values.type().destruct(buffer);
}
});
}
};
@@ -125,9 +123,7 @@ class AppendFunction : public MultiFunction {
GVectorArray_TypedMutableRef<int> vectors = params.vector_mutable<int>(0);
const VArray<int> &values = params.readonly_single_input<int>(1);
for (int64_t i : mask) {
vectors.append(i, values[i]);
}
mask.foreach_index([&](const int64_t i) { vectors.append(i, values[i]); });
}
};
@@ -150,13 +146,13 @@ class SumVectorFunction : public MultiFunction {
const VVectorArray<int> &vectors = params.readonly_vector_input<int>(0);
MutableSpan<int> sums = params.uninitialized_single_output<int>(1);
for (int64_t i : mask) {
mask.foreach_index([&](const int64_t i) {
int sum = 0;
for (int j : IndexRange(vectors.get_vector_size(i))) {
sum += vectors.get_vector_element(i, j);
}
sums[i] = sum;
}
});
}
};
@@ -181,9 +177,8 @@ class OptionalOutputsFunction : public MultiFunction {
index_mask::masked_fill(values, 5, mask);
}
MutableSpan<std::string> values = params.uninitialized_single_output<std::string>(1, "Out 2");
for (const int i : mask) {
new (&values[i]) std::string("hello, this is a long string");
}
mask.foreach_index(
[&](const int i) { new (&values[i]) std::string("hello, this is a long string"); });
}
};

View File

@@ -398,7 +398,7 @@ class SampleCurveFunction : public mf::MultiFunction {
Vector<int> invalid_indices;
MultiValueMap<int, int> indices_per_curve;
devirtualize_varray(curve_indices, [&](const auto curve_indices) {
for (const int i : mask) {
mask.foreach_index([&](const int i) {
const int curve_i = curve_indices[i];
if (curves.curves_range().contains(curve_i)) {
indices_per_curve.add(curve_i, i);
@@ -406,7 +406,7 @@ class SampleCurveFunction : public mf::MultiFunction {
else {
invalid_indices.append(i);
}
}
});
});
Review

This `memory` will fill up while processing in the for loop, since `from_indices` doesn't clear the existing memory. Maybe better to declare it inside the loop? Or maybe not, hrmm...

Actually, maybe I'll just try to replace this with the same `from_groups` thing from elsewhere.
Review

Yeah, difficult, will also leave that for later for now. It's probably good to refactor this a bit more like you mentioned.
IndexMaskMemory memory;
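To make the concern concrete, a hedged sketch of the pattern being discussed, with `groups`, `indices_of_group`, and `process` as placeholders:

```cpp
IndexMaskMemory memory; /* Lives across all iterations and only grows. */
for (const int group : groups) {
  /* Each call allocates the new mask's storage in `memory`. Allocations
   * from earlier iterations stay alive even after their masks are no
   * longer used, so peak memory scales with the number of groups. */
  const IndexMask mask = IndexMask::from_indices<int>(indices_of_group(group), memory);
  process(mask);
}
```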

View File

@@ -21,9 +21,8 @@ static void node_declare(NodeDeclarationBuilder &b)
static KDTree_3d *build_kdtree(const Span<float3> positions, const IndexMask &mask)
{
KDTree_3d *tree = BLI_kdtree_3d_new(mask.size());
for (const int index : mask) {
BLI_kdtree_3d_insert(tree, index, positions[index]);
}
mask.foreach_index(
[&](const int index) { BLI_kdtree_3d_insert(tree, index, positions[index]); });
BLI_kdtree_3d_balance(tree);
return tree;
}
@@ -41,10 +40,8 @@ static void find_neighbors(const KDTree_3d &tree,
const IndexMask &mask,
MutableSpan<int> r_indices)
{
threading::parallel_for(mask.index_range(), 1024, [&](const IndexRange range) {
for (const int index : mask.slice(range)) {
r_indices[index] = find_nearest_non_self(tree, positions[index], index);
}
mask.foreach_index(GrainSize(1024), [&](const int index) {
r_indices[index] = find_nearest_non_self(tree, positions[index], index);
});
}
@@ -87,58 +84,38 @@ class IndexOfNearestFieldInput final : public bke::GeometryFieldInput {
const VArraySpan<int> group_ids_span(group_ids);
VectorSet<int> group_indexing;
for (const int index : mask) {
for (const int index : IndexRange(domain_size)) {
const int group_id = group_ids_span[index];
group_indexing.add(group_id);
}
const int groups_num = group_indexing.size();
/* Each group ID has two corresponding index masks. One that contains all the points
* in each group and one that contains all the points in the group that should be looked up
* (the intersection of the points in the group and `mask`). In many cases, both of these
* masks are the same or very similar, so there is not enough benefit for a separate mask
* for the lookups. */
const bool use_separate_lookup_indices = mask.size() < domain_size / 2;
IndexMaskMemory mask_memory;
Array<IndexMask> all_indices_by_group_id(groups_num);
Array<IndexMask> lookup_indices_by_group_id(groups_num);
Array<Vector<int64_t>> all_indices_by_group_id(group_indexing.size());
Array<Vector<int64_t>> lookup_indices_by_group_id;
if (use_separate_lookup_indices) {
result.reinitialize(mask.min_array_size());
lookup_indices_by_group_id.reinitialize(group_indexing.size());
}
else {
result.reinitialize(domain_size);
}
const auto build_group_masks = [&](const IndexMask &mask,
MutableSpan<Vector<int64_t>> r_groups) {
mask.foreach_index([&](const int index) {
const int group_id = group_ids_span[index];
const int index_of_group = group_indexing.index_of_try(group_id);
if (index_of_group != -1) {
r_groups[index_of_group].append(index);
}
});
const auto get_group_index = [&](const int i) {
const int group_id = group_ids_span[i];
return group_indexing.index_of(group_id);
};
threading::parallel_invoke(
domain_size > 1024 && use_separate_lookup_indices,
[&]() {
if (use_separate_lookup_indices) {
build_group_masks(mask, lookup_indices_by_group_id);
}
},
[&]() { build_group_masks(IndexMask(domain_size), all_indices_by_group_id); });
IndexMask::from_groups<int>(
IndexMask(domain_size), mask_memory, get_group_index, all_indices_by_group_id);
if (mask.size() == domain_size) {
lookup_indices_by_group_id = all_indices_by_group_id;
}
else {
IndexMask::from_groups<int>(mask, mask_memory, get_group_index, lookup_indices_by_group_id);
}
/* The grain size should be larger as each tree gets smaller. */
const int avg_tree_size = domain_size / group_indexing.size();
const int grain_size = std::max(8192 / avg_tree_size, 1);
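/* Illustrative arithmetic: 1M points in 4 groups gives an average tree of
 * 250k points and a grain size of max(8192 / 250000, 1) = 1, so each tree is
 * its own task; 1M points in 500k groups gives an average tree of 2 points
 * and a grain size of 4096, batching many tiny trees per task. */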
threading::parallel_for(group_indexing.index_range(), grain_size, [&](const IndexRange range) {
for (const int index : range) {
const IndexMask tree_mask = all_indices_by_group_id[index].as_span();
const IndexMask lookup_mask = use_separate_lookup_indices ?
IndexMask(lookup_indices_by_group_id[index]) :
tree_mask;
threading::parallel_for(IndexRange(groups_num), grain_size, [&](const IndexRange range) {
for (const int group_index : range) {
const IndexMask &tree_mask = all_indices_by_group_id[group_index];
const IndexMask &lookup_mask = lookup_indices_by_group_id[group_index];
KDTree_3d *tree = build_kdtree(positions, tree_mask);
find_neighbors(*tree, positions, lookup_mask, result);
BLI_kdtree_3d_free(tree);

View File

@@ -137,7 +137,7 @@ static void raycast_to_mesh(const IndexMask &mask,
/* We shouldn't be rebuilding the BVH tree when calling this function in parallel. */
BLI_assert(tree_data.cached);
for (const int i : mask) {
mask.foreach_index([&](const int i) {
const float ray_length = ray_lengths[i];
const float3 ray_origin = ray_origins[i];
const float3 ray_direction = ray_directions[i];
@@ -187,7 +187,7 @@ static void raycast_to_mesh(const IndexMask &mask,
r_hit_distances[i] = ray_length;
}
}
}
});
}
class RaycastFunction : public mf::MultiFunction {

View File

@@ -21,15 +21,13 @@ void copy_with_checked_indices(const VArray<T> &src,
{
const IndexRange src_range = src.index_range();
devirtualize_varray2(src, indices, [&](const auto src, const auto indices) {
threading::parallel_for(mask.index_range(), 4096, [&](IndexRange range) {
for (const int i : mask.slice(range)) {
const int index = indices[i];
if (src_range.contains(index)) {
dst[i] = src[index];
}
else {
dst[i] = {};
}
mask.foreach_index(GrainSize(4096), [&](const int i) {
const int index = indices[i];
if (src_range.contains(index)) {
dst[i] = src[index];
}
else {
dst[i] = {};
}
});
});
@@ -177,11 +175,9 @@ void copy_with_clamped_indices(const VArray<T> &src,
{
const int last_index = src.index_range().last();
devirtualize_varray2(src, indices, [&](const auto src, const auto indices) {
threading::parallel_for(mask.index_range(), 4096, [&](IndexRange range) {
for (const int i : mask.slice(range)) {
const int index = indices[i];
dst[i] = src[std::clamp(index, 0, last_index)];
}
mask.foreach_index(GrainSize(4096), [&](const int i) {
const int index = indices[i];
dst[i] = src[std::clamp(index, 0, last_index)];
});
});
}