BLI: Improve IndexMask::complement() performance #108331

Merged
Hans Goudey merged 15 commits from HooglyBoogly/blender:index-mask-complement-performance into main 2023-05-31 17:11:11 +02:00
2 changed files with 221 additions and 8 deletions

@@ -150,14 +150,6 @@ IndexMask IndexMask::slice_and_offset(const int64_t start,
  return sliced_mask;
}

IndexMask IndexMask::complement(const IndexRange universe, IndexMaskMemory &memory) const
{
  /* TODO: Implement more efficient solution. */
  return IndexMask::from_predicate(universe, GrainSize(512), memory, [&](const int64_t index) {
    return !this->contains(index);
  });
}

/**
 * Merges consecutive segments in some cases. Having fewer but larger segments generally allows for
 * better performance when using the mask later on.
@@ -330,6 +322,168 @@ struct ParallelSegmentsCollector {
  }
};
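
`ParallelSegmentsCollector` is defined outside this hunk; it lets each worker thread gather segments into its own local storage and merges everything once at the end (it is used in `complement()` below). A minimal standalone sketch of that collect-locally-then-merge pattern, with placeholder types rather than the actual Blender structs:

```cpp
// Sketch only: "collect per worker, merge once". Placeholder types, not
// Blender's ParallelSegmentsCollector.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct Segment {
  int64_t offset = 0;
  std::vector<int16_t> indices;  // Indices relative to `offset`.
};

struct LocalCollector {
  std::vector<Segment> segments;  // Filled by a single worker, no locking needed.
};

struct Collector {
  std::vector<LocalCollector> locals;  // One entry per worker thread.

  // Move all per-worker segments into one list and restore global order,
  // since workers may finish their chunks in any order.
  void reduce(std::vector<Segment> &r_segments)
  {
    for (LocalCollector &local : locals) {
      for (Segment &segment : local.segments) {
        r_segments.push_back(std::move(segment));
      }
    }
    std::sort(r_segments.begin(), r_segments.end(), [](const Segment &a, const Segment &b) {
      return a.offset < b.offset;
    });
  }
};
```
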
/**
 * Convert a range to potentially multiple index mask segments.
 */
static void range_to_segments(const IndexRange range, Vector<IndexMaskSegment, 16> &r_segments)
{
  const Span<int16_t> static_indices = get_static_indices_array();
  for (int64_t start = 0; start < range.size(); start += max_segment_size) {
    const int64_t size = std::min(max_segment_size, range.size() - start);
    r_segments.append_as(range.start() + start, static_indices.take_front(size));
  }
}
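
For illustration: every emitted segment holds at most `max_segment_size` indices (a constant from `BLI_index_mask.hh`, assumed here to be 16384), so one large range becomes several range-backed segments. A standalone sketch of the same splitting loop, with `(start, size)` pairs standing in for the real segments:

```cpp
// Sketch only: split [range_start, range_start + range_size) into chunks of at
// most max_segment_size indices, mirroring range_to_segments() above.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static std::vector<std::pair<int64_t, int64_t>> split_range(const int64_t range_start,
                                                            const int64_t range_size,
                                                            const int64_t max_segment_size)
{
  std::vector<std::pair<int64_t, int64_t>> segments;
  for (int64_t start = 0; start < range_size; start += max_segment_size) {
    const int64_t size = std::min(max_segment_size, range_size - start);
    segments.emplace_back(range_start + start, size);
  }
  return segments;
}

int main()
{
  // A range of 40000 indices starting at 100 becomes three segments:
  // (100, 16384), (16484, 16384), (32868, 7232).
  for (const auto &[start, size] : split_range(100, 40000, 16384)) {
    std::printf("segment: start=%lld size=%lld\n",
                static_cast<long long>(start),
                static_cast<long long>(size));
  }
}
```
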
static int64_t get_size_before_gap(const Span<int16_t> indices)
{
  BLI_assert(indices.size() >= 2);
  if (indices[1] > indices[0] + 1) {
    /* For sparse indices, often the next gap is just after the next index.
     * In this case we can skip the logarithmic check below. */
    return 1;
  }
  return unique_sorted_indices::find_size_of_next_range(indices);
}
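
To make the fast path above concrete (see also the review thread further down): for sparse indices the gap usually starts right after the first index, so the logarithmic `find_size_of_next_range` search only runs when the first two indices are consecutive. A minimal standalone sketch, with a linear stand-in for that helper (the real one uses a logarithmic search):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for unique_sorted_indices::find_size_of_next_range(): length of the
// leading run of consecutive values in a sorted span of unique indices.
static int64_t find_leading_run_size(const std::vector<int16_t> &indices)
{
  int64_t size = 1;
  while (size < int64_t(indices.size()) && indices[size] == indices[size - 1] + 1) {
    size++;
  }
  return size;
}

static int64_t size_before_gap(const std::vector<int16_t> &indices)
{
  assert(indices.size() >= 2);
  if (indices[1] > indices[0] + 1) {
    // Sparse case: a gap directly follows the first index, no search needed.
    return 1;
  }
  return find_leading_run_size(indices);
}

int main()
{
  assert(size_before_gap({3, 7, 8, 9, 20}) == 1);  // Fast path: gap right after 3.
  assert(size_before_gap({7, 8, 9, 20}) == 3);     // The run 7..9 precedes the gap.
}
```
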
static void inverted_indices_to_segments(const IndexMaskSegment segment,
                                         LinearAllocator<> &allocator,
                                         Vector<IndexMaskSegment, 16> &r_segments)
{
  constexpr int64_t range_threshold = 64;
  const int64_t offset = segment.offset();
  const Span<int16_t> static_indices = get_static_indices_array();

  int64_t inverted_index_count = 0;
  std::array<int16_t, max_segment_size> inverted_indices_array;
  auto add_indices = [&](const int16_t start, const int16_t num) {
    int16_t *new_indices_begin = inverted_indices_array.data() + inverted_index_count;
    std::iota(new_indices_begin, new_indices_begin + num, start);
    inverted_index_count += num;
  };
  auto finish_indices = [&]() {

Review

Doing this logarithmic range-size-search for potentially every index is not efficient. It may be possible to improve performance of `find_size_of_next_range` for small ranges.

Review

I did some experimenting with this and ended up specializing it for single indices. I'm sure there are more possibilities here for the future too!

    if (inverted_index_count == 0) {
      return;
    }
    MutableSpan<int16_t> offset_indices = allocator.allocate_array<int16_t>(inverted_index_count);
    offset_indices.copy_from(Span(inverted_indices_array).take_front(inverted_index_count));
    r_segments.append_as(offset, offset_indices);
    inverted_index_count = 0;
  };

  Span<int16_t> indices = segment.base_span();
  while (indices.size() > 1) {
    const int64_t size_before_gap = get_size_before_gap(indices);
    if (size_before_gap == indices.size()) {
      break;

HooglyBoogly marked this conversation as resolved
Review

Add indices "at once" instead of one by one. Essentially increasing `inverted_index_count` only once.

Review

This didn't seem to change the performance, but I did it anyway just in case; it is a bit clearer.

    }
    const int16_t gap_first = indices[size_before_gap - 1] + 1;
    const int16_t next = indices[size_before_gap];
    const int16_t gap_size = next - gap_first;
    if (gap_size > range_threshold) {
      finish_indices();
      r_segments.append_as(offset + gap_first, static_indices.take_front(gap_size));
    }
    else {
      add_indices(gap_first, gap_size);
    }
    indices = indices.drop_front(size_before_gap);
  }
  finish_indices();
}
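
In short, the function above walks the gaps within one segment: gaps of at most `range_threshold` (64) missing indices are buffered as explicit `int16_t` indices, while larger gaps are emitted directly as range segments backed by the static indices array, which keeps the resulting mask coarse. A small sketch that only classifies the gaps of an example segment the same way:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch: classify the gaps in a sorted, unique in-segment index list using the
// same threshold that inverted_indices_to_segments() uses above.
int main()
{
  const std::vector<int16_t> indices = {0, 1, 2, 5, 6, 200};
  const int64_t range_threshold = 64;
  for (size_t i = 0; i + 1 < indices.size(); i++) {
    const int16_t gap_first = indices[i] + 1;
    const int16_t gap_size = indices[i + 1] - gap_first;
    if (gap_size == 0) {
      continue;  // Consecutive indices, nothing missing in between.
    }
    // Prints: gap [3, 5): explicit indices, then gap [7, 200): range segment.
    std::printf("gap [%d, %d): %s\n",
                int(gap_first),
                int(gap_first + gap_size),
                gap_size > range_threshold ? "range segment" : "explicit indices");
  }
}
```
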
static void invert_segments(const IndexMask &mask,
                            const IndexRange segment_range,
                            LinearAllocator<> &allocator,
                            Vector<IndexMaskSegment, 16> &r_segments)
{
  for (const int64_t segment_i : segment_range) {
    const IndexMaskSegment segment = mask.segment(segment_i);
    inverted_indices_to_segments(segment, allocator, r_segments);

    const IndexMaskSegment next_segment = mask.segment(segment_i + 1);
    const int64_t between_start = segment.last() + 1;
    const int64_t size_between_segments = next_segment[0] - segment.last() - 1;
    const IndexRange range_between_segments(between_start, size_between_segments);
    if (!range_between_segments.is_empty()) {
      range_to_segments(range_between_segments, r_segments);
    }
  }
}

IndexMask IndexMask::complement(const IndexRange universe, IndexMaskMemory &memory) const
{
  if (this->is_empty()) {
    return universe;
  }
  if (universe.is_empty()) {
    return {};
  }

  const std::optional<IndexRange> this_range = this->to_range();
  if (this_range) {
    const bool first_in_range = this_range->first() <= universe.first();
    const bool last_in_range = this_range->last() >= universe.last();
    if (first_in_range && last_in_range) {
      /* This mask fills the entire universe, so the complement is empty. */
      return {};
    }
    if (first_in_range) {
      /* This mask is a range that contains the start of the universe.
       * The complement is a range that contains the end of the universe. */
      const int64_t complement_start = this_range->one_after_last();
      const int64_t complement_size = universe.one_after_last() - complement_start;
      return IndexRange(complement_start, complement_size);
    }
    if (last_in_range) {
      /* This mask is a range that contains the end of the universe.
       * The complement is a range that contains the start of the universe. */
      const int64_t complement_start = universe.first();
      const int64_t complement_size = this_range->first() - complement_start;
      return IndexRange(complement_start, complement_size);
    }
  }

  Vector<IndexMaskSegment, 16> segments;
  if (universe.start() < this->first()) {
    range_to_segments(universe.take_front(this->first() - universe.start()), segments);
  }

  if (!this_range) {
    const int64_t segments_num = this->segments_num();
    constexpr int64_t min_grain_size = 16;
    constexpr int64_t max_grain_size = 4096;
    const int64_t threads_num = BLI_system_thread_count();
    const int64_t grain_size = std::clamp(
        segments_num / threads_num, min_grain_size, max_grain_size);

    const IndexRange non_last_segments = IndexRange(segments_num).drop_back(1);
    if (segments_num < min_grain_size) {
      invert_segments(*this, non_last_segments, memory, segments);
    }
    else {
      ParallelSegmentsCollector segments_collector;
      threading::parallel_for(non_last_segments, grain_size, [&](const IndexRange range) {
        ParallelSegmentsCollector::LocalData &local_data =
            segments_collector.data_by_thread.local();
        invert_segments(*this, range, local_data.allocator, local_data.segments);
      });
      segments_collector.reduce(memory, segments);
    }
    inverted_indices_to_segments(this->segment(segments_num - 1), memory, segments);
  }
  if (universe.last() > this->last()) {
    range_to_segments(universe.take_back(universe.last() - this->last()), segments);
  }

  return mask_from_segments(segments, memory);
}
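
For a concrete picture of the early-exit branches, a small usage sketch in the style of the tests below (values chosen for illustration):

```cpp
IndexMaskMemory memory;

/* The mask is a single range covering the tail of the universe, so complement()
 * returns early with the head range: the 100 indices [0, 100). No segments are
 * built in this case. */
const IndexMask mask(IndexRange(100, 900));
const IndexMask complement = mask.complement(IndexRange(1000), memory);
/* complement.size() == 100, complement.first() == 0, complement.last() == 99. */
```
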
template<typename T>
IndexMask IndexMask::from_indices(const Span<T> indices, IndexMaskMemory &memory)
{

@@ -223,4 +223,63 @@ TEST(index_mask, FromPredicateFuzzy)
  });
}

TEST(index_mask, Complement)
{
  IndexMaskMemory memory;
  {
    const IndexMask mask(0);
    const IndexMask complement = mask.complement(IndexRange(100), memory);
    EXPECT_EQ(100 - mask.size(), complement.size());
    complement.foreach_index([&](const int64_t i) { EXPECT_FALSE(mask.contains(i)); });
    mask.foreach_index([&](const int64_t i) { EXPECT_FALSE(complement.contains(i)); });
  }
  {
    const IndexMask mask(10000);
    const IndexMask complement = mask.complement(IndexRange(10000), memory);
    EXPECT_EQ(10000 - mask.size(), complement.size());
    complement.foreach_index([&](const int64_t i) { EXPECT_FALSE(mask.contains(i)); });
    mask.foreach_index([&](const int64_t i) { EXPECT_FALSE(complement.contains(i)); });
  }
  {
    const IndexMask mask(IndexRange(100, 900));
    const IndexMask complement = mask.complement(IndexRange(1000), memory);
    EXPECT_EQ(1000 - mask.size(), complement.size());
    complement.foreach_index([&](const int64_t i) { EXPECT_FALSE(mask.contains(i)); });
    mask.foreach_index([&](const int64_t i) { EXPECT_FALSE(complement.contains(i)); });
  }
  {
    const IndexMask mask(IndexRange(0, 900));
    const IndexMask complement = mask.complement(IndexRange(1000), memory);
    EXPECT_EQ(1000 - mask.size(), complement.size());
    complement.foreach_index([&](const int64_t i) { EXPECT_FALSE(mask.contains(i)); });
    mask.foreach_index([&](const int64_t i) { EXPECT_FALSE(complement.contains(i)); });
  }
}

TEST(index_mask, ComplementFuzzy)
{
  RandomNumberGenerator rng;
  const int64_t mask_size = 100;
  const int64_t iter_num = 100;
  const int64_t universe_size = 110;
  for (const int64_t iter : IndexRange(iter_num)) {
    Set<int> values;
    for ([[maybe_unused]] const int64_t _ : IndexRange(iter)) {
      values.add(rng.get_int32(mask_size));
    }
    IndexMaskMemory memory;
    const IndexMask mask = IndexMask::from_predicate(
        IndexRange(mask_size), GrainSize(1024), memory, [&](const int64_t i) {
          return values.contains(int(i));
        });
    const IndexMask complement = mask.complement(IndexRange(universe_size), memory);
    EXPECT_EQ(universe_size - mask.size(), complement.size());
    complement.foreach_index([&](const int64_t i) { EXPECT_FALSE(mask.contains(i)); });
    mask.foreach_index([&](const int64_t i) { EXPECT_FALSE(complement.contains(i)); });
  }
}

}  // namespace blender::index_mask::tests