diff options
Diffstat (limited to 'vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp')
-rw-r--r-- | vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp | 91 |
1 files changed, 81 insertions, 10 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index 69b34ece2c7..dfd1b3b0c3b 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -24,12 +24,13 @@ using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaT namespace { std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold, - LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type, + LevenshteinDfa::Matching matching) { - auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type); + auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type, matching); auto maybe_match_lhs = dfa_lhs.match(right); - auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type); + auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type, matching); auto maybe_match_rhs = dfa_rhs.match(left); EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); @@ -40,6 +41,12 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig return std::nullopt; } +std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold, + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) +{ + return do_calculate(left, right, threshold, casing, dfa_type, LevenshteinDfa::Matching::FullString); +} + std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold, LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) { @@ -74,6 +81,16 @@ struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> { return do_calculate(left, right, threshold, casing(), dfa_type()); } + static std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) { + // Prefix matching is not symmetric, cannot use do_calculate() as it implicitly checks this. + auto dfa = LevenshteinDfa::build(left, threshold, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + auto maybe_match = dfa.match(right); + if (maybe_match.matches()) { + return {maybe_match.edits()}; + } + return std::nullopt; + } + }; // All the baseline DFA tests use lowercase only, so they should have the exact same outcome @@ -98,6 +115,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max; EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max; EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max; + EXPECT_EQ(calculate("", "", max), std::optional{0}) << max; } EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); @@ -109,6 +127,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); EXPECT_EQ(calculate("abc", "", 2), std::nullopt); EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt); } TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { @@ -124,6 +143,37 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt); } +TEST_P(LevenshteinDfaTest, can_match_in_target_string_prefix_mode) { + for (auto max : {1, 2}) { + EXPECT_EQ(prefix_calculate("", "literally anything", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("", "", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("x", "", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abc", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcd", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcdef", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "ab", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("ac", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acd", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("bc", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acb", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acb", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abd", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbc", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xy", max), std::nullopt); + } + EXPECT_EQ(prefix_calculate("abc", "xxabc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); +} + void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor, std::string_view successor_prefix) { @@ -182,11 +232,22 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings "\xf0\x9f\xa4\xa9""food"); // <starry eyed emoji>food // Note that as a general rule, emojis are fickle beasts to deal with since a single - // emoji often takes up multiple code points, which we consider separate characters - // but a user sees as a single actual rendered glyph. + // emoji often takes up multiple code points. We consider these as separate characters + // but from a Unicode perspective they are considered part of an "extended grapheme + // cluster", from which the actually rendered glyph is derived. // Multi-code point character edit distance support is left as an exercise for the reader :D } +TEST_P(LevenshteinDfaTest, can_generate_successors_with_prefix_match_mode) { + auto dfa = LevenshteinDfa::build("ban", 1, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + // There is no difference at all in successor output when using Prefix mode as opposed to + // FullString mode, but since we match _more_ source strings in prefix mode we cannot + // simply run the same test set for both modes. This just tests that the wiring seems fine. + test_dfa_successor(dfa, "", "\x01""an"); + test_dfa_successor(dfa, "xy", "yan"); + test_dfa_successor(dfa, "bolle", "bon"); +} + TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) { auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); // The successor string must be lexicographically larger than the input string. @@ -332,13 +393,21 @@ std::string bits_to_str(T v) { return ret; } -using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>; +using CasingAndDfaTypeAndMaxEdits = std::tuple< + LevenshteinDfa::Casing, + LevenshteinDfa::DfaType, + LevenshteinDfa::Matching, + uint32_t +>; struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> { - // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number. + // Print test suffix as e.g. "/Uncased_Explicit_FullString_1" instead of just a GTest-chosen number. static std::string stringify_params(const TestParamInfo<ParamType>& info) { std::ostringstream ss; - ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param); + ss << std::get<0>(info.param) + << '_' << std::get<1>(info.param) + << '_' << std::get<2>(info.param) + << '_' << std::get<3>(info.param); return ss.str(); } }; @@ -350,6 +419,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, Values(LevenshteinDfa::DfaType::Explicit, LevenshteinDfa::DfaType::Implicit, LevenshteinDfa::DfaType::Table), + Values(LevenshteinDfa::Matching::FullString, + LevenshteinDfa::Matching::Prefix), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); @@ -369,10 +440,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, * Inspired by approach used by Lucene DFA exhaustive testing. */ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { - const auto [casing, dfa_type, max_edits] = GetParam(); + const auto [casing, dfa_type, matching, max_edits] = GetParam(); for (uint32_t i = 0; i < 256; ++i) { const auto target = bits_to_str(static_cast<uint8_t>(i)); - auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type); + auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type, matching); std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast<uint8_t>(j)); |