AK: Implement reverse-order memory search

This is particularly useful for compression, where we want to search
through the lookback buffer for the match with the smallest possible
distance from the end of the buffer.
This commit is contained in:
Tim Schumacher
2025-12-31 13:13:37 +01:00
parent 01002f433b
commit 7d291c4657
2 changed files with 123 additions and 0 deletions

View File

@@ -101,6 +101,70 @@ requires(requires { (*haystack_begin).data(); (*haystack_begin).size(); })
return {};
}
// Searches a chunked haystack for `needle`, scanning from the back towards the front.
//
// The chunks in [haystack_begin, haystack_end) are visited in iteration order and each chunk is
// read from its last byte to its first. The chunk that holds the end of the logical buffer
// therefore has to come first in the range.
//
// Returns the number of haystack bytes consumed (counted from the back) once the first match is
// complete, i.e. the distance from the end of the haystack to the first byte of the match.
// Returns an empty Optional if the needle does not occur. An empty needle yields 0.
template<typename HaystackIterT>
inline Optional<size_t> memmem_reverse(HaystackIterT const& haystack_begin, HaystackIterT const& haystack_end, ReadonlyBytes needle)
requires(requires { (*haystack_begin).data(); (*haystack_begin).size(); })
{
    // An empty needle matches trivially at distance 0. Without this guard the table setup below
    // would write to table[0] of a zero-sized vector.
    if (needle.size() == 0)
        return static_cast<size_t>(0);

    // Note: This is a simple inversion of our modified KMP algorithm that is used in AK::memmem.
    // Be aware that we keep the table values mostly positive and the indices refer to the number
    // of matched characters.
    // In short: We really only invert the array accesses into the needle and haystack.
    auto prepare_kmp_partial_table = [&] {
        Vector<int, 64> table;
        table.try_resize(needle.size()).release_value_but_fixme_should_propagate_errors();

        size_t position = 1;
        int candidate = 0;

        // -1 marks "no fallback possible, advance the haystack instead".
        table[0] = -1;
        while (position < needle.size()) {
            if (needle[needle.size() - 1 - position] == needle[needle.size() - 1 - candidate]) {
                table[position] = table[candidate];
            } else {
                table[position] = candidate;
                // Fall back through shorter candidates until one can be extended (or none is left).
                do {
                    candidate = table[candidate];
                } while (candidate >= 0 && needle[needle.size() - 1 - candidate] != needle[needle.size() - 1 - position]);
            }
            ++position;
            ++candidate;
        }
        return table;
    };

    auto table = prepare_kmp_partial_table();

    // Bytes consumed across all chunks so far, counted from the back of the haystack.
    size_t total_haystack_index = 0;
    // Bytes consumed from the back of the current chunk.
    size_t current_haystack_index = 0;
    // Number of needle bytes (counted from the back of the needle) that are currently matched.
    int needle_index = 0;
    auto haystack_it = haystack_begin;

    while (haystack_it != haystack_end) {
        auto&& chunk = *haystack_it;

        // Current chunk is exhausted (or empty): continue with the next one.
        if (current_haystack_index >= chunk.size()) {
            current_haystack_index = 0;
            ++haystack_it;
            continue;
        }

        if (needle[needle.size() - 1 - needle_index] == chunk[chunk.size() - 1 - current_haystack_index]) {
            ++needle_index;
            ++current_haystack_index;
            ++total_haystack_index;
            if (static_cast<size_t>(needle_index) == needle.size())
                return total_haystack_index;
            continue;
        }

        // Mismatch: reuse the longest partial match that is still valid.
        needle_index = table[needle_index];
        if (needle_index < 0) {
            // Nothing to reuse, so skip this haystack byte and start matching from scratch.
            ++needle_index;
            ++current_haystack_index;
            ++total_haystack_index;
        }
    }

    return {};
}
inline Optional<size_t> memmem_optional(void const* haystack, size_t haystack_length, void const* needle, size_t needle_length)
{
if (needle_length == 0)

View File

@@ -76,6 +76,65 @@ TEST_CASE(kmp_two_chunks)
EXPECT(!result_3.has_value());
}
TEST_CASE(kmp_reverse_one_chunk)
{
    // The complete haystack sits in a single chunk. Expected values are distances from the end
    // of the haystack to the first byte of the match. The fallback 9 exceeds the haystack
    // length of 8, so it cannot be confused with a real result.
    Array<u8, 8> bytes { 1, 0, 1, 2, 3, 4, 5, 0 };
    Array<Array<u8, 8>, 1> chunks { bytes };

    Array<u8, 4> inner_needle { 2, 3, 4, 5 };
    Array<u8, 4> earlier_needle { 1, 2, 3, 4 };
    Array<u8, 4> trailing_needle { 3, 4, 5, 0 };
    Array<u8, 4> absent_needle { 3, 4, 5, 6 };

    // Match located in the interior of the haystack.
    auto inner_result = AK::memmem_reverse(chunks.begin(), chunks.end(), inner_needle);
    // Match that starts one byte earlier than the previous one.
    auto earlier_result = AK::memmem_reverse(chunks.begin(), chunks.end(), earlier_needle);
    // Match that ends exactly at the end of the haystack.
    auto trailing_result = AK::memmem_reverse(chunks.begin(), chunks.end(), trailing_needle);
    // Needle that does not occur at all.
    auto absent_result = AK::memmem_reverse(chunks.begin(), chunks.end(), absent_needle);

    EXPECT_EQ(inner_result.value_or(9), 5u);
    EXPECT_EQ(earlier_result.value_or(9), 6u);
    EXPECT_EQ(trailing_result.value_or(9), 4u);
    EXPECT(!absent_result.has_value());
}
TEST_CASE(kmp_reverse_two_chunks)
{
    // Same logical haystack as the single-chunk test, split into two halves. The reverse search
    // walks the chunk list front to back, so the trailing half is listed first.
    Array<u8, 4> leading_half { 1, 0, 1, 2 };
    Array<u8, 4> trailing_half { 3, 4, 5, 0 };
    Array<Array<u8, 4>, 2> chunks { trailing_half, leading_half };

    Array<u8, 4> straddling_needle { 2, 3, 4, 5 };
    Array<u8, 4> wider_straddling_needle { 1, 2, 3, 4 };
    Array<u8, 4> whole_chunk_needle { 3, 4, 5, 0 };
    Array<u8, 4> absent_needle { 3, 4, 5, 6 };

    // Match that crosses the chunk boundary.
    auto straddling_result = AK::memmem_reverse(chunks.begin(), chunks.end(), straddling_needle);
    // Match that crosses the boundary with two bytes on either side.
    auto wider_straddling_result = AK::memmem_reverse(chunks.begin(), chunks.end(), wider_straddling_needle);
    // Match that covers exactly the trailing chunk.
    auto whole_chunk_result = AK::memmem_reverse(chunks.begin(), chunks.end(), whole_chunk_needle);
    // Needle that does not occur at all.
    auto absent_result = AK::memmem_reverse(chunks.begin(), chunks.end(), absent_needle);

    EXPECT_EQ(straddling_result.value_or(9), 5u);
    EXPECT_EQ(wider_straddling_result.value_or(9), 6u);
    EXPECT_EQ(whole_chunk_result.value_or(9), 4u);
    EXPECT(!absent_result.has_value());
}
TEST_CASE(kmp_match_order)
{
    // Both needles occur twice in the haystack { 1, 0, 1, 2, 3, 4, 5, 0 }. The forward search is
    // expected to report the first occurrence, the reverse search the last one.
    Array<u8, 4> leading_half { 1, 0, 1, 2 };
    Array<u8, 4> trailing_half { 3, 4, 5, 0 };
    Array<Array<u8, 4>, 2> forward_chunks { leading_half, trailing_half };
    Array<Array<u8, 4>, 2> backward_chunks { trailing_half, leading_half };

    // Byte 0 sits at offsets 1 and 7: forward index 1, reverse distance 1 from the end.
    Array<u8, 1> zero_needle { 0 };
    auto zero_forward = AK::memmem(forward_chunks.begin(), forward_chunks.end(), zero_needle);
    auto zero_backward = AK::memmem_reverse(backward_chunks.begin(), backward_chunks.end(), zero_needle);
    EXPECT_EQ(zero_forward.value_or(9), 1u);
    EXPECT_EQ(zero_backward.value_or(9), 1u);

    // Byte 1 sits at offsets 0 and 2: forward index 0, reverse distance 6 from the end.
    Array<u8, 1> one_needle { 1 };
    auto one_forward = AK::memmem(forward_chunks.begin(), forward_chunks.end(), one_needle);
    auto one_backward = AK::memmem_reverse(backward_chunks.begin(), backward_chunks.end(), one_needle);
    EXPECT_EQ(one_forward.value_or(9), 0u);
    EXPECT_EQ(one_backward.value_or(9), 6u);
}
TEST_CASE(timing_safe_compare)
{
ByteString data_set = "abcdefghijklmnopqrstuvwxyz123456789";