summaryrefslogtreecommitdiffstats
path: root/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp')
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp222
1 files changed, 165 insertions, 57 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index 6966fd0b703..c466d0bfb3e 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -1,10 +1,10 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
#include <vespa/vespalib/fuzzy/dfa_stepping_base.h>
-#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
#include <vespa/vespalib/fuzzy/levenshtein_distance.h> // For benchmarking purposes
+#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/util/benchmark_timer.h>
-#include <charconv>
#include <concepts>
#include <filesystem>
#include <fstream>
@@ -18,38 +18,67 @@ namespace fs = std::filesystem;
static std::string benchmark_dictionary;
-struct LevenshteinDfaTest : TestWithParam<LevenshteinDfa::DfaType> {
+using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType>;
- static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+namespace {
- static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
- auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type());
- auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
+ LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+ auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
+ auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
- auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type());
- auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+ auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
+ auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
- EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
- if (maybe_match_lhs.matches()) {
- EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
- return {maybe_match_lhs.edits()};
- }
- return std::nullopt;
+ EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
+ if (maybe_match_lhs.matches()) {
+ EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
+ return {maybe_match_lhs.edits()};
+ }
+ return std::nullopt;
+}
+
+std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold,
+ LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+ std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
+ std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
+ return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type);
+}
+
+}
+
+struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
+
+ static LevenshteinDfa::Casing casing() noexcept { return std::get<0>(GetParam()); }
+ static LevenshteinDfa::DfaType dfa_type() noexcept { return std::get<1>(GetParam()); }
+
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+ std::ostringstream ss;
+ ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+ return ss.str();
+ }
+
+ static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+ return do_calculate(left, right, threshold, casing(), dfa_type());
}
static std::optional<uint32_t> calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) {
- std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
- std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
- return calculate(lhs_ch, rhs_ch, threshold);
+ return do_calculate(left, right, threshold, casing(), dfa_type());
}
};
-INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+// All the baseline DFA tests use lowercase only, so they should have the exact same outcome
+// for both cased and uncased matching.
+INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes,
LevenshteinDfaTest,
- Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit),
- PrintToStringParamName());
+ Combine(Values(LevenshteinDfa::Casing::Uncased,
+ LevenshteinDfa::Casing::Cased),
+ Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit)),
+ LevenshteinDfaTest::stringify_params);
// Same as existing non-DFA Levenshtein tests, but with some added instantiations
// for smaller max distances.
@@ -101,7 +130,7 @@ void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std:
}
TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
- auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+ auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
test_dfa_successor(dfa, "", "\x01""food");
test_dfa_successor(dfa, "faa", "faod");
@@ -122,8 +151,8 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
// Also works with Unicode
// 2 chars
- test_dfa_successor(dfa, "\xc3\x86""x", // "Æx"
- "\xc3\x87""food"); // "Çfood"
+ test_dfa_successor(dfa, "\xc3\xa6""x", // "æx"
+ "\xc3\xa7""food"); // "çfood"
// 3 chars
test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は"
"\xe7\x8c\xac""food"); // "猬food"
@@ -138,7 +167,7 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
}
TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
- auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+ auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
// The successor string must be lexicographically larger than the input string.
// In the presence of a wildcard output edge we handle this by increase the input
// character by 1 and encoding it back as UTF-8.
@@ -155,7 +184,7 @@ TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_
}
TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) {
- auto dfa = LevenshteinDfa::build("", 1, dfa_type());
+ auto dfa = LevenshteinDfa::build("", 1, casing(), dfa_type());
test_dfa_successor(dfa, "aa", "b");
test_dfa_successor(dfa, "b\x01", "c");
test_dfa_successor(dfa, "vespa", "w");
@@ -170,12 +199,80 @@ TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) {
EXPECT_EQ(calculate("\xff\xff", "a", 2), std::optional{2});
EXPECT_EQ(calculate("a", "\xff", 2), std::optional{1});
EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt);
- EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
+ EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
}
TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) {
- EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument);
- EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument);
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, casing(), dfa_type()), std::invalid_argument);
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, casing(), dfa_type()), std::invalid_argument);
+}
+
+struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> {
+ static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+
+ static std::optional<uint32_t> calculate_cased(std::string_view left, std::string_view right, uint32_t threshold) {
+ return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Cased, dfa_type());
+ }
+
+ static std::optional<uint32_t> calculate_uncased(std::string_view left, std::string_view right, uint32_t threshold) {
+ return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Uncased, dfa_type());
+ }
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+ LevenshteinDfaCasingTest,
+ Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit),
+ PrintToStringParamName());
+
+TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) {
+ for (auto max : {1, 2}) {
+ EXPECT_EQ(calculate_uncased("abc", "ABC", max), std::optional{0}) << max;
+ EXPECT_EQ(calculate_uncased("Abc", "aB1", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("aBC", "1bc", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("Abc", "a1C", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("aBc", "AB", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("ABC", "abcd", max), std::optional{1}) << max;
+ }
+ EXPECT_EQ(calculate_uncased("bc", "aBCd", 2), std::optional{2});
+ EXPECT_EQ(calculate_uncased("ab", "AbCd", 2), std::optional{2});
+ EXPECT_EQ(calculate_uncased("CD", "AbcD", 2), std::optional{2});
+ EXPECT_EQ(calculate_uncased("ad", "AbcD", 2), std::optional{2});
+}
+
+TEST_P(LevenshteinDfaCasingTest, cased_edge_cases_have_correct_edit_distance) {
+ for (auto max : {1, 2}) {
+ EXPECT_EQ(calculate_cased("abc", "abC", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_cased("Abc", "aB1", max), std::nullopt) << max;
+ EXPECT_EQ(calculate_cased("aBC", "1bc", max), std::nullopt) << max;
+ EXPECT_EQ(calculate_cased("Abc", "a1C", max), std::nullopt) << max;
+ EXPECT_EQ(calculate_cased("ABC", "abcd", max), std::nullopt) << max;
+ }
+ EXPECT_EQ(calculate_cased("abc", "ABC", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("abc", "aBC", 2), std::optional{2});
+ EXPECT_EQ(calculate_cased("bc", "aBCd", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("ab", "AbCd", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("CD", "AbcD", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("ad", "AbcD", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("ad", "aBCd", 2), std::optional{2});
+}
+
+TEST_P(LevenshteinDfaCasingTest, uncased_successor_is_emitted_as_if_match_term_was_lowercased) {
+ auto dfa = LevenshteinDfa::build("FOOD", 1, LevenshteinDfa::Casing::Uncased, dfa_type());
+ // This is a subset of the other successor test cases
+ test_dfa_successor(dfa, "", "\x01""food");
+ test_dfa_successor(dfa, "FAA", "faod");
+ test_dfa_successor(dfa, "fOoOoO", "foop");
+ test_dfa_successor(dfa, "OOOf", "pfood");
+ test_dfa_successor(dfa, "Fo", "fo\x01""d");
+ test_dfa_successor(dfa, "oO", "ood");
+ test_dfa_successor(dfa, "OOO", "oood");
+ test_dfa_successor(dfa, "FOXX", "foyd");
+ test_dfa_successor(dfa, "GG", "good");
+ test_dfa_successor(dfa, "Gp", "hfood");
+ test_dfa_successor(dfa, "EP", "f\x01""od");
+ test_dfa_successor(dfa, "Hfoodz", "hood");
+ test_dfa_successor(dfa, "Aooodz", "bfood");
}
// Turn integer v into its bitwise string representation with the MSB as the leftmost character.
@@ -191,20 +288,22 @@ std::string bits_to_str(T v) {
return ret;
}
-using DfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::DfaType, uint32_t>;
+using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>;
-struct LevenshteinDfaSuccessorTest : TestWithParam<DfaTypeAndMaxEdits> {
- // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number.
+struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> {
+ // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number.
static std::string stringify_params(const TestParamInfo<ParamType>& info) {
std::ostringstream ss;
- ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+ ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param);
return ss.str();
}
};
INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
LevenshteinDfaSuccessorTest,
- Combine(Values(LevenshteinDfa::DfaType::Explicit,
+ Combine(Values(LevenshteinDfa::Casing::Uncased,
+ LevenshteinDfa::Casing::Cased),
+ Values(LevenshteinDfa::DfaType::Explicit,
LevenshteinDfa::DfaType::Implicit),
Values(1, 2)),
LevenshteinDfaSuccessorTest::stringify_params);
@@ -225,10 +324,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
* Inspired by approach used by Lucene DFA exhaustive testing.
*/
TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
- const auto [dfa_type, max_edits] = GetParam();
+ const auto [casing, dfa_type, max_edits] = GetParam();
for (uint32_t i = 0; i < 256; ++i) {
const auto target = bits_to_str(static_cast<uint8_t>(i));
- auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type);
+ auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type);
std::string skip_to, successor;
for (uint32_t j = 0; j < 256; ++j) {
const auto source = bits_to_str(static_cast<uint8_t>(j));
@@ -298,11 +397,10 @@ enum class BenchmarkType {
};
const char* to_s(BenchmarkType t) noexcept {
- // Note: need underscores since this is used as part of GTest-generated test instance names
switch (t) {
- case BenchmarkType::DfaExplicit: return "DFA_explicit";
- case BenchmarkType::DfaImplicit: return "DFA_implicit";
- case BenchmarkType::Legacy: return "legacy";
+ case BenchmarkType::DfaExplicit: return "DfaExplicit";
+ case BenchmarkType::DfaImplicit: return "DfaImplicit";
+ case BenchmarkType::Legacy: return "Legacy";
}
abort();
}
@@ -315,10 +413,14 @@ const char* to_s(BenchmarkType t) noexcept {
return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64};
}
-struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
+using BenchmarkTypeAndCasing = std::tuple<BenchmarkType, LevenshteinDfa::Casing>;
+
+struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkTypeAndCasing> {
static std::string stringify_params(const TestParamInfo<ParamType>& info) {
- return to_s(info.param);
+ std::ostringstream ss;
+ ss << to_s(std::get<0>(info.param)) << '_' << std::get<1>(info.param);
+ return ss.str();
}
void SetUp() override {
@@ -327,7 +429,8 @@ struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
}
}
- static BenchmarkType benchmark_type() noexcept { return GetParam(); }
+ static BenchmarkType benchmark_type() noexcept { return std::get<0>(GetParam()); }
+ static LevenshteinDfa::Casing casing() noexcept { return std::get<1>(GetParam()); }
static const std::vector<std::string>& load_dictionary_once() {
static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary));
@@ -351,9 +454,11 @@ struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
LevenshteinBenchmarkTest,
- Values(BenchmarkType::DfaExplicit,
- BenchmarkType::DfaImplicit,
- BenchmarkType::Legacy),
+ Combine(Values(BenchmarkType::DfaExplicit,
+ BenchmarkType::DfaImplicit,
+ BenchmarkType::Legacy),
+ Values(LevenshteinDfa::Casing::Cased,
+ LevenshteinDfa::Casing::Uncased)),
LevenshteinBenchmarkTest::stringify_params);
// ("abc", 1) => "a"
@@ -382,12 +487,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t
// string must be matched and any sparse representation is always maximally filled since
// we never expend any edits via mismatches.
// Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.).
- std::string str = repeated_string("abcde", sz);
+ std::string str = repeated_string("aBcDeFgHiJ", sz);
double min_time_s;
if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
: LevenshteinDfa::DfaType::Implicit;
- auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
min_time_s = BenchmarkTimer::benchmark([&] {
auto res = dfa.match(str, nullptr); // not benchmarking successor generation
do_not_optimize_away(res);
@@ -408,15 +513,16 @@ TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) {
if (!benchmarking_enabled()) {
GTEST_SKIP() << "benchmarking not enabled";
}
+ const auto casing = LevenshteinDfa::Casing::Cased; // For building, casing only affects initial string normalization
using vespalib::BenchmarkTimer;
for (uint8_t k : {1, 2}) {
for (uint32_t sz : string_lengths()) {
- std::string str = repeated_string("abcde", sz);
+ std::string str = repeated_string("aBcDeFgHiJ", sz);
double min_time_s = BenchmarkTimer::benchmark([&] {
- auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit);
do_not_optimize_away(dfa);
}, 2.0);
- auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit);
size_t mem_usage = dfa.memory_usage();
fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage);
}
@@ -431,12 +537,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
fprintf(stderr, "------ %s ------\n", to_s(type));
for (uint8_t k : {1, 2}) {
for (uint32_t sz : target_lengths) {
- std::string str = repeated_string("abcde", sz);
+ std::string str = repeated_string("aBcDeFgHiJ", sz);
double min_time_s;
if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
: LevenshteinDfa::DfaType::Implicit;
- auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
min_time_s = BenchmarkTimer::benchmark([&] {
for (const auto& line : dict) {
auto res = dfa.match(line, nullptr);
@@ -447,7 +553,9 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
min_time_s = BenchmarkTimer::benchmark([&] {
auto target_u32 = utf8_string_to_utf32(str);
for (const auto& line : dict) {
- auto line_u32 = utf8_string_to_utf32(line);
+ std::vector<uint32_t> line_u32 = ((casing() == LevenshteinDfa::Casing::Uncased)
+ ? vespalib::LowerCase::convert_to_ucs4(std::string_view(str))
+ : utf8_string_to_utf32(line));
auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k);
do_not_optimize_away(res);
}
@@ -472,7 +580,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
std::string str = repeated_string("abcde", sz);
auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
: LevenshteinDfa::DfaType::Implicit;
- auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
double min_time_s = BenchmarkTimer::benchmark([&] {
auto iter = dict.cbegin();
auto end = dict.cend();