AK: Search CircularBuffer matches from the end of memory

Searching from the end finds the matches with the shortest distance first, which is generally advantageous for compression algorithms.
This commit is contained in:
Tim Schumacher
2025-12-31 13:32:08 +01:00
parent 7d291c4657
commit 13c720c206
2 changed files with 12 additions and 16 deletions

View File

@@ -410,19 +410,19 @@ Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in
// Try a plain memory search for smaller values.
// Note: This overlaps with the hash search for chunks of size HASH_CHUNK_SIZE for the purpose of validation.
if (minimum_length <= HASH_CHUNK_SIZE) {
size_t haystack_offset_from_start = 0;
size_t haystack_offset_from_end = 0;
Vector<ReadonlyBytes, 2> haystack;
// Note: memmem_reverse expects memory chunks in the order that it should search in,
// so haystack[0] needs to be the memory with the highest match priority.
haystack.append(next_search_span(search_limit()));
if (haystack[0].size() < search_limit())
haystack.append(next_search_span(search_limit() - haystack[0].size()));
haystack.prepend(next_search_span(search_limit() - haystack[0].size()));
// TODO: `memmem` searches the memory in "natural" order, which means that it finds matches with a greater distance first.
// Hash-based searching finds the shortest distances first, which is most likely better for encoding and memory efficiency.
// Look into creating a `memmem_reverse`, which starts searching from the end.
auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
auto memmem_match = AK::memmem_reverse(haystack.begin(), haystack.end(), needle);
while (memmem_match.has_value()) {
auto match_offset = memmem_match.release_value();
auto corrected_match_distance = search_limit() - haystack_offset_from_start - match_offset;
auto corrected_match_distance = haystack_offset_from_end + match_offset;
// Validate the match through the set-distance-based implementation and extend it to the largest size possible.
auto maybe_new_match = find_copy_in_seekback(Array { corrected_match_distance }, min(maximum_length, HASH_CHUNK_SIZE), minimum_length);
@@ -445,13 +445,13 @@ Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in
// Trim away the already processed bytes from the haystack.
// Running out of haystack to discard is fine; in that case we found a match at the largest
// distance and therefore tried to advance past it.
haystack_offset_from_start += size_to_discard;
haystack_offset_from_end += size_to_discard;
while (size_to_discard > 0 && haystack.size() > 0) {
if (haystack[0].size() <= size_to_discard) {
size_to_discard -= haystack[0].size();
haystack.remove(0);
} else {
haystack[0] = haystack[0].slice(size_to_discard);
haystack[0] = haystack[0].slice(0, haystack[0].size() - size_to_discard);
break;
}
}
@@ -460,7 +460,7 @@ Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in
break;
// Try and find the next match.
memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
memmem_match = AK::memmem_reverse(haystack.begin(), haystack.end(), needle);
}
// If we found a match of size HASH_CHUNK_SIZE, we should have already found that using the hash search. Investigate.

View File

@@ -342,15 +342,11 @@ TEST_CASE(find_copy_in_seekback)
auto written_needle_bytes = buffer.write(needle);
VERIFY(written_needle_bytes == needle.size());
// Note: As of now, the preference during a tie is determined by which algorithm found the match.
// Hash-based matching finds the shortest distance first, while memmem finds the greatest distance first.
// A matching TODO can be found in CircularBuffer.cpp.
{
// Find the largest match with a length between 1 and 1 (all "A").
auto match = buffer.find_copy_in_seekback(1, 1);
EXPECT(match.has_value());
EXPECT_EQ(match.value().distance, 11ul);
EXPECT_EQ(match.value().distance, 2ul);
EXPECT_EQ(match.value().length, 1ul);
}
@@ -358,7 +354,7 @@ TEST_CASE(find_copy_in_seekback)
// Find the largest match with a length between 1 and 2 (all "AB", everything smaller gets eliminated).
auto match = buffer.find_copy_in_seekback(2, 1);
EXPECT(match.has_value());
EXPECT_EQ(match.value().distance, 11ul);
EXPECT_EQ(match.value().distance, 2ul);
EXPECT_EQ(match.value().length, 2ul);
}