From 0e8aab2c447e0ed5cac035d385c95ad44d8594d6 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Wed, 3 Apr 2024 09:39:31 +0000 Subject: Add prefix match support to Levenshtein algorithm implementations Adds support for matching the _prefix_ of a source string against a target string (the prefix query) within a bounded maximum number of `k` max edits. Iff the number of edits required is within the specified bound, returns the _minimum_ number of edits required to transform the source string prefix to the full target string. By convention, we treat the target string as the "columnar" string in the Levenshtein matrix (i.e. its characters are column-indexed, whereas the source string is row-indexed). This matters for prefix matching, because unlike regular Levenshtein matching it is _not_ symmetric between source and target strings. By definition, the Levenshtein matrix cell at row `i`, column `j` provides the minimum number of edits required to transform a prefix of source string S (up to and including length `i`) into a prefix of target string T (up to and including length `j`). Since we want to match against the entire target (prefix query) string of length `n`, the problem is reduced to finding the minimum value of the `n`th column that is `<= k`. Example: matching the source string `abcdef` against the target `acd` with `k` == 2: a c d 0 1 2 3 a 1 0 1 2 b 2 1 1 2 c 3 2 1 2 d 4 3 2 1 e 5 4 3 2 f 6 5 4 3 In this case, the _shortest_ matching prefix is simply `a`, but the _minimum edits_ prefix is `abcd`. The latter prefix's distance is what we return. For our generalized (i.e. arbitrary `k`) Levenshtein implementation, this is implemented fairly straight forward since it operates on matrix rows already (sparsely populated around the diagonal). For the DFA implementation(s), transitioning between states in a Levenshtein DFA is equivalent to explicitly computing the (sparse) matrix columns around the diagonal for the source character being currently matched, so the same principle applies directly to it. Since we don't have any explicit notion of the value of matrix columns in the abstract DFA, a source string in prefix mode will be considered a match when _any_ DFA state is a match. By definition, this is as-if the matrix row represented by the state has a `n`th column that is <= `k`. We then follow the DFA until it can no longer match, keeping track of the lowest state edit distance encountered. This mirrors finding the row whose `n`th column minimizes `k`. Prefix matching support has been added to the core DFA matching loop algorithms, with only very minor changes to the underlying DFA implementations. Successor generation upon mismatch should work as expected with no algorithmic changes introduced for prefix matching. Prefix match mode has been added as a dimension to the exhaustive successor checking unit test. --- vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp | 91 ++++++++++++++++++++--- 1 file changed, 81 insertions(+), 10 deletions(-) (limited to 'vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp') diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index 69b34ece2c7..dfd1b3b0c3b 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -24,12 +24,13 @@ using CasingAndDfaType = std::tuple do_calculate(std::string_view left, std::string_view right, uint32_t threshold, - LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type, + LevenshteinDfa::Matching matching) { - auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type); + auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type, matching); auto maybe_match_lhs = dfa_lhs.match(right); - auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type); + auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type, matching); auto maybe_match_rhs = dfa_rhs.match(left); EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); @@ -40,6 +41,12 @@ std::optional do_calculate(std::string_view left, std::string_view rig return std::nullopt; } +std::optional do_calculate(std::string_view left, std::string_view right, uint32_t threshold, + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) +{ + return do_calculate(left, right, threshold, casing, dfa_type, LevenshteinDfa::Matching::FullString); +} + std::optional do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold, LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) { @@ -74,6 +81,16 @@ struct LevenshteinDfaTest : TestWithParam { return do_calculate(left, right, threshold, casing(), dfa_type()); } + static std::optional prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) { + // Prefix matching is not symmetric, cannot use do_calculate() as it implicitly checks this. + auto dfa = LevenshteinDfa::build(left, threshold, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + auto maybe_match = dfa.match(right); + if (maybe_match.matches()) { + return {maybe_match.edits()}; + } + return std::nullopt; + } + }; // All the baseline DFA tests use lowercase only, so they should have the exact same outcome @@ -98,6 +115,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max; EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max; EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max; + EXPECT_EQ(calculate("", "", max), std::optional{0}) << max; } EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); @@ -109,6 +127,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); EXPECT_EQ(calculate("abc", "", 2), std::nullopt); EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt); } TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { @@ -124,6 +143,37 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt); } +TEST_P(LevenshteinDfaTest, can_match_in_target_string_prefix_mode) { + for (auto max : {1, 2}) { + EXPECT_EQ(prefix_calculate("", "literally anything", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("", "", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("x", "", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abc", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcd", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcdef", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "ab", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("ac", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acd", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("bc", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acb", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acb", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abd", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbc", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xy", max), std::nullopt); + } + EXPECT_EQ(prefix_calculate("abc", "xxabc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); +} + void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor, std::string_view successor_prefix) { @@ -182,11 +232,22 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings "\xf0\x9f\xa4\xa9""food"); // food // Note that as a general rule, emojis are fickle beasts to deal with since a single - // emoji often takes up multiple code points, which we consider separate characters - // but a user sees as a single actual rendered glyph. + // emoji often takes up multiple code points. We consider these as separate characters + // but from a Unicode perspective they are considered part of an "extended grapheme + // cluster", from which the actually rendered glyph is derived. // Multi-code point character edit distance support is left as an exercise for the reader :D } +TEST_P(LevenshteinDfaTest, can_generate_successors_with_prefix_match_mode) { + auto dfa = LevenshteinDfa::build("ban", 1, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + // There is no difference at all in successor output when using Prefix mode as opposed to + // FullString mode, but since we match _more_ source strings in prefix mode we cannot + // simply run the same test set for both modes. This just tests that the wiring seems fine. + test_dfa_successor(dfa, "", "\x01""an"); + test_dfa_successor(dfa, "xy", "yan"); + test_dfa_successor(dfa, "bolle", "bon"); +} + TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) { auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); // The successor string must be lexicographically larger than the input string. @@ -332,13 +393,21 @@ std::string bits_to_str(T v) { return ret; } -using CasingAndDfaTypeAndMaxEdits = std::tuple; +using CasingAndDfaTypeAndMaxEdits = std::tuple< + LevenshteinDfa::Casing, + LevenshteinDfa::DfaType, + LevenshteinDfa::Matching, + uint32_t +>; struct LevenshteinDfaSuccessorTest : TestWithParam { - // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number. + // Print test suffix as e.g. "/Uncased_Explicit_FullString_1" instead of just a GTest-chosen number. static std::string stringify_params(const TestParamInfo& info) { std::ostringstream ss; - ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param); + ss << std::get<0>(info.param) + << '_' << std::get<1>(info.param) + << '_' << std::get<2>(info.param) + << '_' << std::get<3>(info.param); return ss.str(); } }; @@ -350,6 +419,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, Values(LevenshteinDfa::DfaType::Explicit, LevenshteinDfa::DfaType::Implicit, LevenshteinDfa::DfaType::Table), + Values(LevenshteinDfa::Matching::FullString, + LevenshteinDfa::Matching::Prefix), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); @@ -369,10 +440,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, * Inspired by approach used by Lucene DFA exhaustive testing. */ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { - const auto [casing, dfa_type, max_edits] = GetParam(); + const auto [casing, dfa_type, matching, max_edits] = GetParam(); for (uint32_t i = 0; i < 256; ++i) { const auto target = bits_to_str(static_cast(i)); - auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type); + auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type, matching); std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast(j)); -- cgit v1.2.3