BLI: refactor IndexMask for better performance and memory usage #104629

Merged
Jacques Lucke merged 254 commits from JacquesLucke/blender:index-mask-refactor into main on 2023-05-24 at 18:11:47 +02:00
4 changed files with 66 additions and 43 deletions
Showing only changes of commit a65112a3db - Show all commits

View File

@ -167,8 +167,13 @@ class IndexMask {
IndexMask slice(IndexRange range) const;
IndexMask slice(int64_t start, int64_t size) const;
IndexMask slice_and_offset(IndexRange range, IndexMaskMemory &memory) const;
IndexMask slice_and_offset(int64_t start, int64_t size, IndexMaskMemory &memory) const;
IndexMask slice_and_offset(IndexRange range,
const int64_t offset,
IndexMaskMemory &memory) const;
IndexMask slice_and_offset(int64_t start,
int64_t size,
const int64_t offset,
IndexMaskMemory &memory) const;
IndexMask complement(const IndexRange universe, IndexMaskMemory &memory) const;
ChunkSlice chunk(const int64_t chunk_i) const;
@ -188,7 +193,8 @@ class IndexMask {
template<typename Fn> void foreach_index_optimized(GrainSize grain_size, Fn &&fn) const;
template<typename T> static IndexMask from_indices(Span<T> indices, IndexMaskMemory &memory);
static IndexMask from_bits(BitSpan bits, IndexMaskMemory &memory, int64_t offset = 0);
static IndexMask from_bits(BitSpan bits, IndexMaskMemory &memory);
static IndexMask from_bits(const IndexMask &universe, BitSpan bits, IndexMaskMemory &memory);
static IndexMask from_bools(Span<bool> bools, IndexMaskMemory &memory);
static IndexMask from_bools(const VArray<bool> &bools, IndexMaskMemory &memory);
static IndexMask from_bools(const IndexMask &universe,
@ -215,8 +221,8 @@ class IndexMask {
MutableSpan<IndexMask> r_masks);
template<typename T> void to_indices(MutableSpan<T> r_indices) const;
void to_bits(MutableBitSpan r_bits, int64_t offset = 0) const;
void to_bools(MutableSpan<bool> r_bools, int64_t offset = 0) const;
void to_bits(MutableBitSpan r_bits) const;
void to_bools(MutableSpan<bool> r_bools) const;
std::optional<IndexRange> to_range() const;
Vector<IndexRange> to_ranges() const;
Vector<IndexRange> to_ranges_invert(IndexRange universe) const;

View File

@ -233,33 +233,50 @@ void IndexMask::foreach_span_impl(const FunctionRef<void(OffsetSpan<int64_t, int
});
}
IndexMask IndexMask::slice_and_offset(const IndexRange range, IndexMaskMemory &memory) const
IndexMask IndexMask::slice_and_offset(const IndexRange range,
const int64_t offset,
IndexMaskMemory &memory) const
{
return this->slice_and_offset(range.start(), range.size(), memory);
return this->slice_and_offset(range.start(), range.size(), offset, memory);
}
/* Build a new mask whose indices are `mask`'s indices shifted by `offset`.
 *
 * The indices are first materialized into a vector of element type `T`
 * (the caller picks `int32_t` when the shifted values fit below `INT32_MAX`,
 * `int64_t` otherwise), shifted in parallel, and then converted back into an
 * #IndexMask allocated from `memory`. */
template<typename T>
static IndexMask offset_indices_in_mask(const IndexMask &mask,
const int64_t offset,
IndexMaskMemory &memory)
{
Vector<T, chunk_capacity> indices(mask.size());
mask.to_indices<T>(indices);
/* Adding the offset is trivial per element, so use a fairly large grain size
 * to avoid parallelization overhead for small masks. */
threading::parallel_for(indices.index_range(), 2048, [&](const IndexRange range) {
for (T &index : indices.as_mutable_span().slice(range)) {
index += T(offset);
}
});
return IndexMask::from_indices<T>(indices, memory);
}
IndexMask IndexMask::slice_and_offset(const int64_t start,
const int64_t size,
const int64_t offset,
JacquesLucke marked this conversation as resolved. (Outdated)

With the 1/2^14 constant factor, a larger inline buffer here could probably eliminate most allocations. Same below with Vector<IndexMaskSegment> segments

With the `1/2^14` constant factor, a larger inline buffer here could probably eliminate most allocations. Same below with `Vector<IndexMaskSegment> segments`
IndexMaskMemory &memory) const
{
if (size == 0) {
return {};
}
if (this->to_range().has_value()) {
return IndexMask(size);
if (std::optional<IndexRange> range = this->to_range()) {
return range->slice(start, size).shift(offset);
}
const IndexMask sliced_mask = this->slice(start, size);
const int64_t offset = sliced_mask.first();
if (offset == 0) {
return sliced_mask;
}
if (sliced_mask.to_range().has_value()) {
return IndexMask(size);
if (std::optional<IndexRange> range = sliced_mask.to_range()) {
return range->shift(offset);
}
const int64_t range_size = sliced_mask.last() - sliced_mask.first() + 1;
BitVector bits(range_size);
sliced_mask.to_bits(bits, offset);
return IndexMask::from_bits(bits, memory);
if (sliced_mask.last() < INT32_MAX && sliced_mask.last() + offset < INT32_MAX) {
return offset_indices_in_mask<int32_t>(sliced_mask, offset, memory);
}
return offset_indices_in_mask<int64_t>(sliced_mask, offset, memory);
}
Review

Maybe not worth it (also just want to test my understanding)-- couldn't this limit the maximum size of the span argument to find_predicate_begin with something like find_predicate_begin(segment_indices.take_front(max_segment_size + offset),...?

Maybe not worth it (also just want to test my understanding)-- couldn't this limit the maximum size of the span argument to `find_predicate_begin` with something like `find_predicate_begin(segment_indices.take_front(max_segment_size + offset),...`?
Review

Not sure why the + offset, but taking at most max_segment_size makes sense.
In practice it likely doesn't make a difference right now, because the span passed to segments_from_indices is already sliced (for multi-threading).

Not sure why the `+ offset`, but taking at most `max_segment_size` makes sense. In practice it likely doesn't make a difference right now, because the span passed to `segments_from_indices` is already sliced (for multi-threading).
IndexMask IndexMask::complement(const IndexRange universe, IndexMaskMemory &memory) const
@ -417,15 +434,18 @@ IndexMask IndexMask::from_indices(const Span<T> indices, IndexMaskMemory &memory
return mask;
}
IndexMask IndexMask::from_bits(const BitSpan bits, IndexMaskMemory &memory, const int64_t offset)
IndexMask IndexMask::from_bits(const BitSpan bits, IndexMaskMemory &memory)
{
Vector<int64_t> indices;
for (const int64_t i : bits.index_range()) {
if (bits[i]) {
indices.append(i + offset);
}
}
return IndexMask::from_indices<int64_t>(indices, memory);
return IndexMask::from_bits(bits.index_range(), bits, memory);
}
/* Build a mask containing exactly those indices of `universe` whose
 * corresponding bit in `bits` is set.
 *
 * Delegates to the generic #from_predicate builder. Note that `bits` is
 * captured by value in the lambda (consistent with the `from_bools` overload
 * below) — presumably #BitSpan is a cheap non-owning view; confirm. */
IndexMask IndexMask::from_bits(const IndexMask &universe,
const BitSpan bits,
IndexMaskMemory &memory)
{
return IndexMask::from_predicate(universe, GrainSize(1024), memory, [bits](const int64_t index) {
return bits[index].test();
});
}
IndexMask IndexMask::from_bools(Span<bool> bools, IndexMaskMemory &memory)
@ -443,7 +463,7 @@ IndexMask IndexMask::from_bools(const IndexMask &universe,
IndexMaskMemory &memory)
{
return IndexMask::from_predicate(
universe, GrainSize(1024), memory, [&](const int64_t index) { return bools[index]; });
universe, GrainSize(1024), memory, [bools](const int64_t index) { return bools[index]; });
}
IndexMask IndexMask::from_bools(const IndexMask &universe,
@ -831,28 +851,28 @@ template<typename T> void IndexMask::to_indices(MutableSpan<T> r_indices) const
[&](const int64_t i, const int64_t mask_i) mutable { r_indices[mask_i] = T(i); });
}
void IndexMask::to_bits(MutableBitSpan r_bits, int64_t offset) const
void IndexMask::to_bits(MutableBitSpan r_bits) const
{
BLI_assert(r_bits.size() >= this->min_array_size() - offset);
BLI_assert(r_bits.size() >= this->min_array_size());
r_bits.reset_all();
this->foreach_span_or_range([&](const auto mask_segment) {
if constexpr (std::is_same_v<std::decay_t<decltype(mask_segment)>, IndexRange>) {
const IndexRange range = mask_segment.shift(-offset);
const IndexRange range = mask_segment;
r_bits.slice(range).set_all();
}
else {
for (const int64_t i : mask_segment) {
r_bits[i - offset].set();
r_bits[i].set();
}
}
});
}
void IndexMask::to_bools(MutableSpan<bool> r_bools, int64_t offset) const
void IndexMask::to_bools(MutableSpan<bool> r_bools) const
{
BLI_assert(r_bools.size() >= this->min_array_size() - offset);
BLI_assert(r_bools.size() >= this->min_array_size());
r_bools.fill(false);
this->foreach_index_optimized([&](const int64_t i) { r_bools[i - offset] = true; });
this->foreach_index_optimized(GrainSize(2048), [&](const int64_t i) { r_bools[i] = true; });
}
Vector<IndexRange> IndexMask::to_ranges() const

View File

@ -97,18 +97,14 @@ TEST(index_mask, FromBits)
IndexMaskMemory memory;
const uint64_t bits =
0b0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'1111'0010'0000;
const IndexMask mask = IndexMask::from_bits(BitSpan(&bits, IndexRange(2, 40)), memory, 100);
const IndexMask mask = IndexMask::from_bits(BitSpan(&bits, IndexRange(2, 40)), memory);
Array<int> indices(5);
mask.to_indices<int>(indices);
EXPECT_EQ(indices[0], 103);
EXPECT_EQ(indices[1], 106);
EXPECT_EQ(indices[2], 107);
EXPECT_EQ(indices[3], 108);
EXPECT_EQ(indices[4], 109);
uint64_t new_bits = 0;
mask.to_bits(MutableBitSpan(&new_bits, IndexRange(5, 40)), 100);
EXPECT_EQ(new_bits, bits << 3);
EXPECT_EQ(indices[0], 3);
EXPECT_EQ(indices[1], 6);
EXPECT_EQ(indices[2], 7);
EXPECT_EQ(indices[3], 8);
EXPECT_EQ(indices[4], 9);
}
TEST(index_mask, FromSize)

View File

@ -149,7 +149,8 @@ void MultiFunction::call_auto(const IndexMask &mask, Params params, Context cont
const IndexRange input_slice_range{input_slice_start, input_slice_size};
IndexMaskMemory memory;
const IndexMask offset_mask = mask.slice_and_offset(sub_range, memory);
const int64_t offset = -input_slice_start;
const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset, memory);
ParamsBuilder sliced_params{*this, &offset_mask};
add_sliced_parameters(*signature_ref_, params, input_slice_range, sliced_params);