BLI: refactor IndexMask for better performance and memory usage #104629
|
@ -67,7 +67,7 @@ BLI_NOINLINE static void sample_corner_attribute(const Span<MLoopTri> looptris,
|
|||
if constexpr (check_indices) {
|
||||
if (looptri_indices[i] == -1) {
|
||||
dst[i] = {};
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
}
|
||||
const MLoopTri &tri = looptris[looptri_indices[i]];
|
||||
|
@ -144,17 +144,17 @@ static void sample_barycentric_weights(const Span<float3> vert_positions,
|
|||
const IndexMask &mask,
|
||||
MutableSpan<float3> bary_coords)
|
||||
{
|
||||
for (const int i : mask) {
|
||||
mask.foreach_index([&](const int i) {
|
||||
if constexpr (check_indices) {
|
||||
if (looptri_indices[i] == -1) {
|
||||
bary_coords[i] = {};
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
}
|
||||
const MLoopTri &tri = looptris[looptri_indices[i]];
|
||||
bary_coords[i] = compute_bary_coord_in_triangle(
|
||||
vert_positions, corner_verts, tri, sample_positions[i]);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template<bool check_indices = false>
|
||||
|
@ -170,7 +170,7 @@ static void sample_nearest_weights(const Span<float3> vert_positions,
|
|||
if constexpr (check_indices) {
|
||||
if (looptri_indices[i] == -1) {
|
||||
bary_coords[i] = {};
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
}
|
||||
const MLoopTri &tri = looptris[looptri_indices[i]];
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include <optional>
|
||||
#include <variant>
|
||||
|
||||
#include "BLI_array.hh"
|
||||
#include "BLI_bit_vector.hh"
|
||||
#include "BLI_function_ref.hh"
|
||||
#include "BLI_index_range.hh"
|
||||
|
@ -201,6 +202,11 @@ class IndexMask {
|
|||
GrainSize grain_size,
|
||||
IndexMaskMemory &memory,
|
||||
Fn &&predicate);
|
||||
template<typename T, typename Fn>
|
||||
static void from_groups(const IndexMask &universe,
|
||||
IndexMaskMemory &memory,
|
||||
Fn &&get_group_index,
|
||||
MutableSpan<IndexMask> r_masks);
|
||||
|
||||
template<typename T> void to_indices(MutableSpan<T> r_indices) const;
|
||||
void to_bits(MutableBitSpan r_bits, int64_t offset = 0) const;
|
||||
|
@ -696,7 +702,8 @@ template<typename Fn> inline void ChunkSlice::foreach_span(Fn &&fn) const
|
|||
fn(indices);
|
||||
}
|
||||
for (int64_t segment_i = this->begin_it.segment_i + 1; segment_i < this->end_it.segment_i;
|
||||
segment_i++) {
|
||||
segment_i++)
|
||||
{
|
||||
const int64_t begin_i = 0;
|
||||
const int64_t end_i = this->chunk->segment_size(segment_i);
|
||||
const Span<int16_t> indices{this->chunk->indices_by_segment[segment_i] + begin_i,
|
||||
|
@ -773,8 +780,8 @@ template<typename Fn> inline void IndexMask::foreach_span_template(Fn &&fn) cons
|
|||
chunk.indices_by_segment[segment_i] + begin_it.index_in_segment, segment_size};
|
||||
fn(chunk_id, indices);
|
||||
}
|
||||
for (int64_t segment_i = begin_it.segment_i + 1; segment_i < chunk.segments_num;
|
||||
segment_i++) {
|
||||
for (int64_t segment_i = begin_it.segment_i + 1; segment_i < chunk.segments_num; segment_i++)
|
||||
{
|
||||
const int64_t segment_size = chunk.cumulative_segment_sizes[segment_i + 1] -
|
||||
chunk.cumulative_segment_sizes[segment_i];
|
||||
const Span<int16_t> indices{chunk.indices_by_segment[segment_i], segment_size};
|
||||
|
@ -1016,6 +1023,22 @@ inline IndexMask IndexMask::from_predicate(const IndexMask &universe,
|
|||
return IndexMask::from_indices<int64_t>(indices, memory);
|
||||
}
|
||||
|
||||
/**
 * Split the indices of `universe` into groups and build one #IndexMask per group.
 *
 * \param universe: The indices to distribute; each one is visited via `foreach_index`.
 * \param memory: Passed on to `IndexMask::from_indices` for every group.
 *   NOTE(review): presumably this owns the storage of the resulting masks and therefore has to
 *   outlive `r_masks` -- confirm against `IndexMaskMemory`.
 * \param get_group_index: Called with each index; its result is used directly as the position
 *   in the per-group array. No bounds check is visible here, so it presumably has to return a
 *   value in `[0, r_masks.size())` -- confirm with callers.
 * \param r_masks: Output. Its size determines the number of groups; every element is
 *   overwritten with the mask built for that group.
 */
template<typename T, typename Fn>
|
||||
void IndexMask::from_groups(const IndexMask &universe,
|
||||
IndexMaskMemory &memory,
|
||||
Fn &&get_group_index,
|
||||
MutableSpan<IndexMask> r_masks)
|
||||
{
|
||||
/* One temporary index list per output mask. */
Array<Vector<T>> indices_by_group(r_masks.size());
|
||||
/* Bucket every index of the universe into the list of its group. */
universe.foreach_index([&](const int64_t i) {
|
||||
const int group_index = get_group_index(i);
|
||||
indices_by_group[group_index].append(T(i));
|
||||
});
|
||||
/* Turn each per-group index list into a mask. */
for (const int64_t i : r_masks.index_range()) {
|
||||
r_masks[i] = IndexMask::from_indices<T>(indices_by_group[i], memory);
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<IndexRange> inline IndexMask::to_range() const
|
||||
{
|
||||
if (data_.indices_num == 0) {
|
||||
|
|
|
@ -23,9 +23,7 @@ class AddPrefixFunction : public MultiFunction {
|
|||
const VArray<std::string> &prefixes = params.readonly_single_input<std::string>(0, "Prefix");
|
||||
MutableSpan<std::string> strings = params.single_mutable<std::string>(1, "Strings");
|
||||
|
||||
for (int64_t i : mask) {
|
||||
strings[i] = prefixes[i] + strings[i];
|
||||
}
|
||||
mask.foreach_index([&](const int64_t i) { strings[i] = prefixes[i] + strings[i]; });
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -48,12 +46,12 @@ class CreateRangeFunction : public MultiFunction {
|
|||
const VArray<int> &sizes = params.readonly_single_input<int>(0, "Size");
|
||||
GVectorArray &ranges = params.vector_output(1, "Range");
|
||||
|
||||
for (int64_t i : mask) {
|
||||
mask.foreach_index([&](const int64_t i) {
|
||||
int size = sizes[i];
|
||||
for (int j : IndexRange(size)) {
|
||||
ranges.append(i, &j);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -75,12 +73,12 @@ class GenericAppendFunction : public MultiFunction {
|
|||
GVectorArray &vectors = params.vector_mutable(0, "Vector");
|
||||
const GVArray &values = params.readonly_single_input(1, "Value");
|
||||
|
||||
for (int64_t i : mask) {
|
||||
mask.foreach_index([&](const int64_t i) {
|
||||
BUFFER_FOR_CPP_TYPE_VALUE(values.type(), buffer);
|
||||
values.get(i, buffer);
|
||||
vectors.append(i, buffer);
|
||||
values.type().destruct(buffer);
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -125,9 +123,7 @@ class AppendFunction : public MultiFunction {
|
|||
GVectorArray_TypedMutableRef<int> vectors = params.vector_mutable<int>(0);
|
||||
const VArray<int> &values = params.readonly_single_input<int>(1);
|
||||
|
||||
for (int64_t i : mask) {
|
||||
vectors.append(i, values[i]);
|
||||
}
|
||||
mask.foreach_index([&](const int64_t i) { vectors.append(i, values[i]); });
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -150,13 +146,13 @@ class SumVectorFunction : public MultiFunction {
|
|||
const VVectorArray<int> &vectors = params.readonly_vector_input<int>(0);
|
||||
MutableSpan<int> sums = params.uninitialized_single_output<int>(1);
|
||||
|
||||
for (int64_t i : mask) {
|
||||
mask.foreach_index([&](const int64_t i) {
|
||||
int sum = 0;
|
||||
for (int j : IndexRange(vectors.get_vector_size(i))) {
|
||||
sum += vectors.get_vector_element(i, j);
|
||||
}
|
||||
sums[i] = sum;
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -181,9 +177,8 @@ class OptionalOutputsFunction : public MultiFunction {
|
|||
index_mask::masked_fill(values, 5, mask);
|
||||
}
|
||||
MutableSpan<std::string> values = params.uninitialized_single_output<std::string>(1, "Out 2");
|
||||
for (const int i : mask) {
|
||||
new (&values[i]) std::string("hello, this is a long string");
|
||||
}
|
||||
mask.foreach_index(
|
||||
[&](const int i) { new (&values[i]) std::string("hello, this is a long string"); });
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -398,7 +398,7 @@ class SampleCurveFunction : public mf::MultiFunction {
|
|||
Vector<int> invalid_indices;
|
||||
MultiValueMap<int, int> indices_per_curve;
|
||||
devirtualize_varray(curve_indices, [&](const auto curve_indices) {
|
||||
for (const int i : mask) {
|
||||
mask.foreach_index([&](const int i) {
|
||||
const int curve_i = curve_indices[i];
|
||||
if (curves.curves_range().contains(curve_i)) {
|
||||
indices_per_curve.add(curve_i, i);
|
||||
|
@ -406,7 +406,7 @@ class SampleCurveFunction : public mf::MultiFunction {
|
|||
else {
|
||||
invalid_indices.append(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
IndexMaskMemory memory;
|
||||
|
|
|
@ -21,9 +21,8 @@ static void node_declare(NodeDeclarationBuilder &b)
|
|||
static KDTree_3d *build_kdtree(const Span<float3> positions, const IndexMask &mask)
|
||||
{
|
||||
KDTree_3d *tree = BLI_kdtree_3d_new(mask.size());
|
||||
for (const int index : mask) {
|
||||
BLI_kdtree_3d_insert(tree, index, positions[index]);
|
||||
}
|
||||
mask.foreach_index(
|
||||
[&](const int index) { BLI_kdtree_3d_insert(tree, index, positions[index]); });
|
||||
BLI_kdtree_3d_balance(tree);
|
||||
return tree;
|
||||
}
|
||||
|
@ -41,10 +40,8 @@ static void find_neighbors(const KDTree_3d &tree,
|
|||
const IndexMask &mask,
|
||||
MutableSpan<int> r_indices)
|
||||
{
|
||||
threading::parallel_for(mask.index_range(), 1024, [&](const IndexRange range) {
|
||||
for (const int index : mask.slice(range)) {
|
||||
r_indices[index] = find_nearest_non_self(tree, positions[index], index);
|
||||
}
|
||||
mask.foreach_index(GrainSize(1024), [&](const int index) {
|
||||
r_indices[index] = find_nearest_non_self(tree, positions[index], index);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -87,58 +84,38 @@ class IndexOfNearestFieldInput final : public bke::GeometryFieldInput {
|
|||
const VArraySpan<int> group_ids_span(group_ids);
|
||||
|
||||
VectorSet<int> group_indexing;
|
||||
for (const int index : mask) {
|
||||
for (const int index : IndexRange(domain_size)) {
|
||||
const int group_id = group_ids_span[index];
|
||||
group_indexing.add(group_id);
|
||||
}
|
||||
const int groups_num = group_indexing.size();
|
||||
|
||||
/* Each group ID has two corresponding index masks. One that contains all the points
|
||||
* in each group and one that contains all the points in the group that should be looked up
|
||||
* (the intersection of the points in the group and `mask`). In many cases, both of these
|
||||
* masks are the same or very similar, so there is not enough benefit for a separate mask
|
||||
* for the lookups. */
|
||||
const bool use_separate_lookup_indices = mask.size() < domain_size / 2;
|
||||
IndexMaskMemory mask_memory;
|
||||
Array<IndexMask> all_indices_by_group_id(groups_num);
|
||||
Array<IndexMask> lookup_indices_by_group_id(groups_num);
|
||||
|
||||
Array<Vector<int64_t>> all_indices_by_group_id(group_indexing.size());
|
||||
Array<Vector<int64_t>> lookup_indices_by_group_id;
|
||||
|
||||
if (use_separate_lookup_indices) {
|
||||
result.reinitialize(mask.min_array_size());
|
||||
lookup_indices_by_group_id.reinitialize(group_indexing.size());
|
||||
}
|
||||
else {
|
||||
result.reinitialize(domain_size);
|
||||
}
|
||||
|
||||
const auto build_group_masks = [&](const IndexMask &mask,
|
||||
MutableSpan<Vector<int64_t>> r_groups) {
|
||||
mask.foreach_index([&](const int index) {
|
||||
const int group_id = group_ids_span[index];
|
||||
const int index_of_group = group_indexing.index_of_try(group_id);
|
||||
if (index_of_group != -1) {
|
||||
r_groups[index_of_group].append(index);
|
||||
}
|
||||
});
|
||||
const auto get_group_index = [&](const int i) {
|
||||
const int group_id = group_ids_span[i];
|
||||
return group_indexing.index_of(group_id);
|
||||
};
|
||||
|
||||
threading::parallel_invoke(
|
||||
domain_size > 1024 && use_separate_lookup_indices,
|
||||
[&]() {
|
||||
if (use_separate_lookup_indices) {
|
||||
build_group_masks(mask, lookup_indices_by_group_id);
|
||||
}
|
||||
},
|
||||
[&]() { build_group_masks(IndexMask(domain_size), all_indices_by_group_id); });
|
||||
IndexMask::from_groups<int>(
|
||||
IndexMask(domain_size), mask_memory, get_group_index, all_indices_by_group_id);
|
||||
|
||||
if (mask.size() == domain_size) {
|
||||
lookup_indices_by_group_id = all_indices_by_group_id;
|
||||
}
|
||||
else {
|
||||
IndexMask::from_groups<int>(mask, mask_memory, get_group_index, all_indices_by_group_id);
|
||||
}
|
||||
|
||||
/* The grain size should be larger as each tree gets smaller. */
|
||||
const int avg_tree_size = domain_size / group_indexing.size();
|
||||
const int grain_size = std::max(8192 / avg_tree_size, 1);
|
||||
threading::parallel_for(group_indexing.index_range(), grain_size, [&](const IndexRange range) {
|
||||
for (const int index : range) {
|
||||
const IndexMask tree_mask = all_indices_by_group_id[index].as_span();
|
||||
const IndexMask lookup_mask = use_separate_lookup_indices ?
|
||||
IndexMask(lookup_indices_by_group_id[index]) :
|
||||
tree_mask;
|
||||
threading::parallel_for(IndexRange(groups_num), grain_size, [&](const IndexRange range) {
|
||||
for (const int group_index : range) {
|
||||
const IndexMask &tree_mask = all_indices_by_group_id[group_index];
|
||||
const IndexMask &lookup_mask = lookup_indices_by_group_id[group_index];
|
||||
KDTree_3d *tree = build_kdtree(positions, tree_mask);
|
||||
find_neighbors(*tree, positions, lookup_mask, result);
|
||||
BLI_kdtree_3d_free(tree);
|
||||
|
|
|
@ -137,7 +137,7 @@ static void raycast_to_mesh(const IndexMask &mask,
|
|||
/* We shouldn't be rebuilding the BVH tree when calling this function in parallel. */
|
||||
BLI_assert(tree_data.cached);
|
||||
|
||||
for (const int i : mask) {
|
||||
mask.foreach_index([&](const int i) {
|
||||
const float ray_length = ray_lengths[i];
|
||||
const float3 ray_origin = ray_origins[i];
|
||||
const float3 ray_direction = ray_directions[i];
|
||||
|
@ -187,7 +187,7 @@ static void raycast_to_mesh(const IndexMask &mask,
|
|||
r_hit_distances[i] = ray_length;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
class RaycastFunction : public mf::MultiFunction {
|
||||
|
|
|
@ -21,15 +21,13 @@ void copy_with_checked_indices(const VArray<T> &src,
|
|||
{
|
||||
const IndexRange src_range = src.index_range();
|
||||
devirtualize_varray2(src, indices, [&](const auto src, const auto indices) {
|
||||
threading::parallel_for(mask.index_range(), 4096, [&](IndexRange range) {
|
||||
for (const int i : mask.slice(range)) {
|
||||
const int index = indices[i];
|
||||
if (src_range.contains(index)) {
|
||||
dst[i] = src[index];
|
||||
}
|
||||
else {
|
||||
dst[i] = {};
|
||||
}
|
||||
mask.foreach_index(GrainSize(4096), [&](const int i) {
|
||||
const int index = indices[i];
|
||||
if (src_range.contains(index)) {
|
||||
dst[i] = src[index];
|
||||
}
|
||||
else {
|
||||
dst[i] = {};
|
||||
}
|
||||
});
|
||||
});
|
||||
|
@ -177,11 +175,9 @@ void copy_with_clamped_indices(const VArray<T> &src,
|
|||
{
|
||||
const int last_index = src.index_range().last();
|
||||
devirtualize_varray2(src, indices, [&](const auto src, const auto indices) {
|
||||
threading::parallel_for(mask.index_range(), 4096, [&](IndexRange range) {
|
||||
for (const int i : mask.slice(range)) {
|
||||
const int index = indices[i];
|
||||
dst[i] = src[std::clamp(index, 0, last_index)];
|
||||
}
|
||||
mask.foreach_index(GrainSize(4096), [&](const int i) {
|
||||
const int index = indices[i];
|
||||
dst[i] = src[std::clamp(index, 0, last_index)];
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
This `memory` will fill up while processing in the for loop, since `from_indices` doesn't clear the existing memory. Maybe better to declare it inside the loop? Or maybe not, hrmm...

Actually, maybe I'll just try to replace this with the same `from_groups` thing from elsewhere.

Yeah, difficult, will also leave that for later for now. It's probably good to refactor this a bit more like you mentioned.