From 0e8aab2c447e0ed5cac035d385c95ad44d8594d6 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Wed, 3 Apr 2024 09:39:31 +0000 Subject: Add prefix match support to Levenshtein algorithm implementations Adds support for matching the _prefix_ of a source string against a target string (the prefix query) within a bounded maximum number of `k` max edits. Iff the number of edits required is within the specified bound, returns the _minimum_ number of edits required to transform the source string prefix to the full target string. By convention, we treat the target string as the "columnar" string in the Levenshtein matrix (i.e. its characters are column-indexed, whereas the source string is row-indexed). This matters for prefix matching, because unlike regular Levenshtein matching it is _not_ symmetric between source and target strings. By definition, the Levenshtein matrix cell at row `i`, column `j` provides the minimum number of edits required to transform a prefix of source string S (up to and including length `i`) into a prefix of target string T (up to and including length `j`). Since we want to match against the entire target (prefix query) string of length `n`, the problem is reduced to finding the minimum value of the `n`th column that is `<= k`. Example: matching the source string `abcdef` against the target `acd` with `k` == 2: a c d 0 1 2 3 a 1 0 1 2 b 2 1 1 2 c 3 2 1 2 d 4 3 2 1 e 5 4 3 2 f 6 5 4 3 In this case, the _shortest_ matching prefix is simply `a`, but the _minimum edits_ prefix is `abcd`. The latter prefix's distance is what we return. For our generalized (i.e. arbitrary `k`) Levenshtein implementation, this is implemented fairly straight forward since it operates on matrix rows already (sparsely populated around the diagonal). For the DFA implementation(s), transitioning between states in a Levenshtein DFA is equivalent to explicitly computing the (sparse) matrix columns around the diagonal for the source character being currently matched, so the same principle applies directly to it. Since we don't have any explicit notion of the value of matrix columns in the abstract DFA, a source string in prefix mode will be considered a match when _any_ DFA state is a match. By definition, this is as-if the matrix row represented by the state has a `n`th column that is <= `k`. We then follow the DFA until it can no longer match, keeping track of the lowest state edit distance encountered. This mirrors finding the row whose `n`th column minimizes `k`. Prefix matching support has been added to the core DFA matching loop algorithms, with only very minor changes to the underlying DFA implementations. Successor generation upon mismatch should work as expected with no algorithmic changes introduced for prefix matching. Prefix match mode has been added as a dimension to the exhaustive successor checking unit test. --- vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp | 91 +++++++++++++++++++--- .../src/tests/fuzzy/levenshtein_distance_test.cpp | 55 +++++++++++++ vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h | 6 ++ .../vespalib/fuzzy/explicit_levenshtein_dfa.h | 12 ++- .../vespalib/fuzzy/explicit_levenshtein_dfa.hpp | 24 +++--- .../vespalib/fuzzy/implicit_levenshtein_dfa.h | 6 +- .../vespalib/fuzzy/implicit_levenshtein_dfa.hpp | 15 ++-- .../src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp | 30 +++++-- .../src/vespa/vespalib/fuzzy/levenshtein_dfa.h | 31 ++++++++ .../vespa/vespalib/fuzzy/levenshtein_distance.cpp | 79 ++++++++++++++----- .../vespa/vespalib/fuzzy/levenshtein_distance.h | 10 ++- .../src/vespa/vespalib/fuzzy/match_algorithm.hpp | 69 +++++++++++++++- vespalib/src/vespa/vespalib/fuzzy/table_dfa.h | 3 +- vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp | 17 ++-- 14 files changed, 379 insertions(+), 69 deletions(-) (limited to 'vespalib') diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index 69b34ece2c7..dfd1b3b0c3b 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -24,12 +24,13 @@ using CasingAndDfaType = std::tuple do_calculate(std::string_view left, std::string_view right, uint32_t threshold, - LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type, + LevenshteinDfa::Matching matching) { - auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type); + auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type, matching); auto maybe_match_lhs = dfa_lhs.match(right); - auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type); + auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type, matching); auto maybe_match_rhs = dfa_rhs.match(left); EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); @@ -40,6 +41,12 @@ std::optional do_calculate(std::string_view left, std::string_view rig return std::nullopt; } +std::optional do_calculate(std::string_view left, std::string_view right, uint32_t threshold, + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) +{ + return do_calculate(left, right, threshold, casing, dfa_type, LevenshteinDfa::Matching::FullString); +} + std::optional do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold, LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) { @@ -74,6 +81,16 @@ struct LevenshteinDfaTest : TestWithParam { return do_calculate(left, right, threshold, casing(), dfa_type()); } + static std::optional prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) { + // Prefix matching is not symmetric, cannot use do_calculate() as it implicitly checks this. + auto dfa = LevenshteinDfa::build(left, threshold, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + auto maybe_match = dfa.match(right); + if (maybe_match.matches()) { + return {maybe_match.edits()}; + } + return std::nullopt; + } + }; // All the baseline DFA tests use lowercase only, so they should have the exact same outcome @@ -98,6 +115,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max; EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max; EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max; + EXPECT_EQ(calculate("", "", max), std::optional{0}) << max; } EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); @@ -109,6 +127,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); EXPECT_EQ(calculate("abc", "", 2), std::nullopt); EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt); } TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { @@ -124,6 +143,37 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt); } +TEST_P(LevenshteinDfaTest, can_match_in_target_string_prefix_mode) { + for (auto max : {1, 2}) { + EXPECT_EQ(prefix_calculate("", "literally anything", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("", "", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("x", "", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abc", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcd", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcdef", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "ab", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("ac", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acd", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("bc", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acb", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acb", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abd", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbc", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xy", max), std::nullopt); + } + EXPECT_EQ(prefix_calculate("abc", "xxabc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); +} + void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor, std::string_view successor_prefix) { @@ -182,11 +232,22 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings "\xf0\x9f\xa4\xa9""food"); // food // Note that as a general rule, emojis are fickle beasts to deal with since a single - // emoji often takes up multiple code points, which we consider separate characters - // but a user sees as a single actual rendered glyph. + // emoji often takes up multiple code points. We consider these as separate characters + // but from a Unicode perspective they are considered part of an "extended grapheme + // cluster", from which the actually rendered glyph is derived. // Multi-code point character edit distance support is left as an exercise for the reader :D } +TEST_P(LevenshteinDfaTest, can_generate_successors_with_prefix_match_mode) { + auto dfa = LevenshteinDfa::build("ban", 1, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + // There is no difference at all in successor output when using Prefix mode as opposed to + // FullString mode, but since we match _more_ source strings in prefix mode we cannot + // simply run the same test set for both modes. This just tests that the wiring seems fine. + test_dfa_successor(dfa, "", "\x01""an"); + test_dfa_successor(dfa, "xy", "yan"); + test_dfa_successor(dfa, "bolle", "bon"); +} + TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) { auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); // The successor string must be lexicographically larger than the input string. @@ -332,13 +393,21 @@ std::string bits_to_str(T v) { return ret; } -using CasingAndDfaTypeAndMaxEdits = std::tuple; +using CasingAndDfaTypeAndMaxEdits = std::tuple< + LevenshteinDfa::Casing, + LevenshteinDfa::DfaType, + LevenshteinDfa::Matching, + uint32_t +>; struct LevenshteinDfaSuccessorTest : TestWithParam { - // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number. + // Print test suffix as e.g. "/Uncased_Explicit_FullString_1" instead of just a GTest-chosen number. static std::string stringify_params(const TestParamInfo& info) { std::ostringstream ss; - ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param); + ss << std::get<0>(info.param) + << '_' << std::get<1>(info.param) + << '_' << std::get<2>(info.param) + << '_' << std::get<3>(info.param); return ss.str(); } }; @@ -350,6 +419,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, Values(LevenshteinDfa::DfaType::Explicit, LevenshteinDfa::DfaType::Implicit, LevenshteinDfa::DfaType::Table), + Values(LevenshteinDfa::Matching::FullString, + LevenshteinDfa::Matching::Prefix), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); @@ -369,10 +440,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, * Inspired by approach used by Lucene DFA exhaustive testing. */ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { - const auto [casing, dfa_type, max_edits] = GetParam(); + const auto [casing, dfa_type, matching, max_edits] = GetParam(); for (uint32_t i = 0; i < 256; ++i) { const auto target = bits_to_str(static_cast(i)); - auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type); + auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type, matching); std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast(j)); diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp index 37d1d4b4c54..918cbc624db 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp @@ -16,6 +16,13 @@ std::optional calculate(std::string_view left, std::string_view right, return leftRight; } +// Prefix matching is asymmetric and therefore cannot implicitly test result symmetry +std::optional prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) { + auto left_codepoints = vespalib::LowerCase::convert_to_ucs4(left); + auto right_codepoints = vespalib::LowerCase::convert_to_ucs4(right); + return vespalib::LevenshteinDistance::calculate(left_codepoints, right_codepoints, threshold, true); +} + TEST(LevenshteinDistance, calculate_edgecases) { EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0}); EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1}); @@ -33,6 +40,54 @@ TEST(LevenshteinDistance, calculate_edgecases) { EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); EXPECT_EQ(calculate("abc", "", 2), std::nullopt); EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt); +} + +TEST(LevenshteinDistance, prefix_match_edge_cases) { + // Same cases as LevenshteinDfaTest (TODO consolidate these somehow) + for (auto max : {1, 2}) { + EXPECT_EQ(prefix_calculate("", "literally anything", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("", "", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("x", "", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abc", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcd", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcdef", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "ab", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("ac", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acd", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("bc", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acb", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acb", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abd", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbc", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xy", max), std::nullopt); + } + EXPECT_EQ(prefix_calculate("abc", "xxabc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); + + // Max edits > 2 cases; not supported by DFA implementation. + EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xyzzz", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abcd", "xyzd", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abcd", "xyzz", 3), std::nullopt); + EXPECT_EQ(prefix_calculate("abcd", "", 3), std::nullopt); +} + +TEST(LevenshteinDistance, oversized_max_edits_is_well_defined) { + const auto k = uint32_t(INT32_MAX) + 10000u; + EXPECT_EQ(calculate("abc", "xyz", k), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xyzzzz", k), std::optional{3}); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h index c69f414aae6..8b2597dd179 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h @@ -20,6 +20,12 @@ concept DfaMatcher = requires(T a, std::string u8str, std::vector u32s // matching to have the expected semantics, the actual target string must be pre-lowercased. { a.is_cased() } -> std::same_as; + // Whether the matching is performed in prefix mode. In prefix mode, a source string is + // considered a match if it matches the target string at any point during DFA traversal. + // I.e. the source string "bananas" prefix-matched against the target (prefix) "ban" will + // be returned as a match with 0 edits. + { a.is_prefix() } -> std::same_as; + // Initial (starting) state of the DFA { a.start() } -> std::same_as; diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h index 490582b5bf7..a3542d61dc8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h @@ -97,9 +97,11 @@ public: private: std::vector _nodes; const bool _is_cased; + const bool _is_prefix; public: - explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept - : _is_cased(is_cased) + ExplicitLevenshteinDfaImpl(bool is_cased, bool is_prefix) noexcept + : _is_cased(is_cased), + _is_prefix(is_prefix) {} ~ExplicitLevenshteinDfaImpl() override; @@ -140,10 +142,12 @@ template class ExplicitLevenshteinDfaBuilder { const std::vector _u32_str_buf; // TODO std::u32string const bool _is_cased; + const bool _is_prefix; public: - ExplicitLevenshteinDfaBuilder(std::vector str, bool is_cased) noexcept + ExplicitLevenshteinDfaBuilder(std::vector str, bool is_cased, bool is_prefix) noexcept : _u32_str_buf(std::move(str)), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) {} [[nodiscard]] LevenshteinDfa build_dfa() const; diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp index 55dd459ff26..11a822d2d7e 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp @@ -22,16 +22,20 @@ struct ExplicitDfaMatcher { const std::span _nodes; const bool _is_cased; + const bool _is_prefix; - ExplicitDfaMatcher(const std::span nodes, bool is_cased) noexcept + ExplicitDfaMatcher(const std::span nodes, bool is_cased, bool is_prefix) noexcept : _nodes(nodes), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) {} static constexpr uint8_t max_edits() noexcept { return MaxEdits; } bool is_cased() const noexcept { return _is_cased; } + bool is_prefix() const noexcept { return _is_prefix; } + StateType start() const noexcept { return &_nodes[0]; } @@ -99,21 +103,21 @@ ExplicitLevenshteinDfaImpl::~ExplicitLevenshteinDfaImpl() = default; template LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl::match(std::string_view u8str) const { - ExplicitDfaMatcher matcher(_nodes, _is_cased); + ExplicitDfaMatcher matcher(_nodes, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str); } template LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl::match(std::string_view u8str, std::string& successor_out) const { - ExplicitDfaMatcher matcher(_nodes, _is_cased); + ExplicitDfaMatcher matcher(_nodes, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str, successor_out); } template LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl::match(std::string_view u8str, std::vector& successor_out) const { - ExplicitDfaMatcher matcher(_nodes, _is_cased); + ExplicitDfaMatcher matcher(_nodes, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str, successor_out); } @@ -199,10 +203,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase { using Base::transitions; const bool _is_cased; + const bool _is_prefix; public: - ExplicitLevenshteinDfaBuilderImpl(std::span str, bool is_cased) noexcept + ExplicitLevenshteinDfaBuilderImpl(std::span str, bool is_cased, bool is_prefix) noexcept : DfaSteppingBase(str), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) { assert(str.size() < UINT32_MAX / max_out_edges_per_node()); } @@ -217,7 +223,7 @@ public: template LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl::build_dfa() const { - auto dfa = std::make_unique>(_is_cased); + auto dfa = std::make_unique>(_is_cased, _is_prefix); ExploreState exp; // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_ // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth. @@ -257,7 +263,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl::build_dfa() const { template LevenshteinDfa ExplicitLevenshteinDfaBuilder::build_dfa() const { - ExplicitLevenshteinDfaBuilderImpl builder(_u32_str_buf, _is_cased); + ExplicitLevenshteinDfaBuilderImpl builder(_u32_str_buf, _is_cased, _is_prefix); return builder.build_dfa(); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h index 20c8e1b7e2b..552fd849734 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h @@ -13,14 +13,16 @@ class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl { std::string _target_as_utf8; std::vector _target_utf8_char_offsets; const bool _is_cased; + const bool _is_prefix; public: using MatchResult = LevenshteinDfa::MatchResult; - ImplicitLevenshteinDfa(std::vector str, bool is_cased) + ImplicitLevenshteinDfa(std::vector str, bool is_cased, bool is_prefix) : _u32_str_buf(std::move(str)), _target_as_utf8(), _target_utf8_char_offsets(), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) { precompute_utf8_target_with_offsets(); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp index 05ef2761f34..7dadcc59b8e 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp @@ -32,21 +32,26 @@ struct ImplicitDfaMatcher : public DfaSteppingBase { std::span _target_as_utf8; std::span _target_utf8_char_offsets; const bool _is_cased; + const bool _is_prefix; ImplicitDfaMatcher(std::span u32_str, std::span target_as_utf8, std::span target_utf8_char_offsets, - bool is_cased) noexcept + bool is_cased, + bool is_prefix) noexcept : Base(u32_str), _target_as_utf8(target_as_utf8), _target_utf8_char_offsets(target_utf8_char_offsets), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) {} // start, is_match, can_match, match_edit_distance are all provided by base type bool is_cased() const noexcept { return _is_cased; } + bool is_prefix() const noexcept { return _is_prefix; } + template bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) { for (uint32_t i = 0; i < state.size(); ++i) { @@ -137,21 +142,21 @@ struct ImplicitDfaMatcher : public DfaSteppingBase { template LevenshteinDfa::MatchResult ImplicitLevenshteinDfa::match(std::string_view u8str) const { - ImplicitDfaMatcher matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + ImplicitDfaMatcher matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str); } template LevenshteinDfa::MatchResult ImplicitLevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const { - ImplicitDfaMatcher matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + ImplicitDfaMatcher matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str, successor_out); } template LevenshteinDfa::MatchResult ImplicitLevenshteinDfa::match(std::string_view u8str, std::vector& successor_out) const { - ImplicitDfaMatcher matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + ImplicitDfaMatcher matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str, successor_out); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index 19c2fffbb3e..468619c8036 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -41,34 +41,40 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const { _impl->dump_as_graphviz(out); } -LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) { +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, + Casing casing, DfaType dfa_type, Matching matching) { if (max_edits != 1 && max_edits != 2) { throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits)); } - const bool is_cased = (casing == Casing::Cased); + const bool is_cased = (casing == Casing::Cased); + const bool is_prefix = (matching == Matching::Prefix); auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string) : utf8_string_to_utf32_lowercased(target_string); if (dfa_type == DfaType::Implicit) { if (max_edits == 1) { - return LevenshteinDfa(std::make_unique>>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique>>(std::move(target_string_u32), is_cased, is_prefix)); } else { // max_edits == 2 - return LevenshteinDfa(std::make_unique>>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique>>(std::move(target_string_u32), is_cased, is_prefix)); } } else if(dfa_type == DfaType::Explicit) { if (max_edits == 1) { - return ExplicitLevenshteinDfaBuilder>(std::move(target_string_u32), is_cased).build_dfa(); + return ExplicitLevenshteinDfaBuilder>(std::move(target_string_u32), is_cased, is_prefix).build_dfa(); } else { // max_edits == 2 - return ExplicitLevenshteinDfaBuilder>(std::move(target_string_u32), is_cased).build_dfa(); + return ExplicitLevenshteinDfaBuilder>(std::move(target_string_u32), is_cased, is_prefix).build_dfa(); } } else { // DfaType::Table if (max_edits == 1) { - return LevenshteinDfa(std::make_unique>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique>(std::move(target_string_u32), is_cased, is_prefix)); } else { // max_edits == 2 - return LevenshteinDfa(std::make_unique>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique>(std::move(target_string_u32), is_cased, is_prefix)); } } } +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) { + return build(target_string, max_edits, casing, dfa_type, Matching::FullString); +} + LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) { // TODO automatically select implementation based on target length/max edits? // Suggestion: @@ -112,4 +118,12 @@ std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) { return os; } +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m) { + switch (m) { + case LevenshteinDfa::Matching::FullString: os << "FullString"; return os; + case LevenshteinDfa::Matching::Prefix: os << "Prefix"; return os; + } + abort(); +} + } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index 44c62bdb957..feace39b313 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -265,6 +265,33 @@ public: Cased }; + enum class Matching { + /** + * Edit distance is computed based on the _entire_ source string. Matching is + * symmetric between source and target strings, i.e. match(x, y) and match(y, x) + * will yield the same result. + */ + FullString, + /** + * Edit distance is computed based on a _prefix_ of the source string, as compared + * against the target string. Matching is therefore _asymmetric_ between source and + * target strings. + * + * Example of matching source strings against the target 'ban' (i.e. the prefix query) + * and 1 max edit distance: + * 'bananas' - 0 edits (source prefix 'ban' exact-matches 'ban') + * 'balloons' - 1 edit ('bal' vs 'ban') + * '2bananas' - 1 edit ('2ban' vs 'ban') + * 'boonanas' - mismatch (2 edits) + * + * Note that Prefix matching will match a lot more strings than FullString, so in + * practice it should be combined with prefix _locking_ to constrain the candidate + * result set to a more reasonable cardinality. In particular, max edits >= |target| + * will match _every_ source string trivially. + */ + Prefix + }; + /** * Builds and returns a Levenshtein DFA that matches all strings within `max_edits` * edits of `target_string`. The type of DFA returned is specified by dfa_type. @@ -273,6 +300,9 @@ public: * * `target_string` must not contain any null UTF-8 chars. */ + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, + Casing casing, DfaType dfa_type, Matching matching); + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type); @@ -301,5 +331,6 @@ public: std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos); std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt); std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c); +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp index 4658635d880..f5b8d0fc0b8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp @@ -3,31 +3,46 @@ #include "levenshtein_distance.h" +#include #include #include +namespace vespalib { + std::optional -vespalib::LevenshteinDistance::calculate(std::span left, std::span right, uint32_t threshold) +LevenshteinDistance::calculate(std::span left, std::span right, + uint32_t threshold, bool prefix_match) { + assert(left.size() <= static_cast(INT32_MAX)); + assert(right.size() <= static_cast(INT32_MAX)); + threshold = std::min(threshold, static_cast(std::numeric_limits::max())); uint32_t n = left.size(); uint32_t m = right.size(); - if (n > m) { - return calculate(right, left, threshold); - } - - // if one string is empty, the edit distance is necessarily the length - // of the other - if (n == 0) { - return m <= threshold ? std::optional(m) : std::nullopt; - } - if (m == 0) { - return n <= threshold ? std::optional(n) : std::nullopt; - } - - // the edit distance cannot be less than the length difference - if (m - n > threshold) { - return std::nullopt; + if (!prefix_match) { + // These optimizations are only valid when matching with target/source string symmetry. + // Correctness of the main matrix calculation loop should not depend on these. + if (n > m) { + return calculate(right, left, threshold, false); + } + // if one string is empty, the edit distance is necessarily the length + // of the other. + if (n == 0) { + return m <= threshold ? std::optional(m) : std::nullopt; + } + if (m == 0) { + return n <= threshold ? std::optional(n) : std::nullopt; + } + // the edit distance cannot be less than the length difference + if (m - n > threshold) { + return std::nullopt; + } + } else { + // A source (right) cannot be transformed into a target prefix (left) if doing + // so would require inserting more than max edits number of characters. + if ((n > m) && (n - m > threshold)) { + return std::nullopt; + } } std::vector p(n+1); // 'previous' cost array, horizontally @@ -48,6 +63,7 @@ vespalib::LevenshteinDistance::calculate(std::span left, std::sp } // iterates through t + uint32_t min_edits = n; // prefix matching: worst-case to transform to target for (uint32_t j = 1; j <= m; ++j) { uint32_t rightJ = right[j - 1]; // jth character of right d[0] = j; @@ -56,9 +72,9 @@ vespalib::LevenshteinDistance::calculate(std::span left, std::sp uint32_t max = j > std::numeric_limits::max() - threshold ? n : std::min(n, j + threshold); - // ignore entry left of leftmost if (min > 1) { + assert(static_cast(min) <= d.size()); d[min - 1] = std::numeric_limits::max(); } @@ -76,12 +92,35 @@ vespalib::LevenshteinDistance::calculate(std::span left, std::sp lowerBound = std::min(lowerBound, d[i]); } if (lowerBound > threshold) { - return std::nullopt; + if (!prefix_match) { + return std::nullopt; // Cannot match + } else { + break; // May already have matched via min_edits + } } std::swap(p, d); + // For prefix matching: + // By definition, the Levenshtein matrix cell at row `i`, column `j` + // provides the minimum number of edits required to transform a prefix of + // source string S (up to and including length `i`) into a prefix of target + // string T (up to and including length `j`). Since we want to match against + // the entire target (prefix query) string with length `n`, the problem is + // reduced to finding the minimum value of the `n`th column that is `<= k` + // (aggregated here and checked after the loop). + min_edits = std::min(p[n], min_edits); } - if (p[n] <= threshold) { + if (prefix_match) { + return ((min_edits <= threshold) ? std::optional{min_edits} : std::nullopt); + } else if (p[n] <= threshold) { return {p[n]}; } return std::nullopt; } + +std::optional +LevenshteinDistance::calculate(std::span left, std::span right, uint32_t threshold) +{ + return calculate(left, right, threshold, false); +} + +} // vespalib diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h index f105dcdaaf4..e284e071731 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h @@ -19,7 +19,15 @@ namespace vespalib { */ class LevenshteinDistance { public: - static std::optional calculate(std::span left, std::span right, uint32_t threshold); + // Iff `prefix_match` == true, `right` is the candidate to match against prefix `left` + static std::optional calculate(std::span left, + std::span right, + uint32_t threshold, + bool prefix_match); + + static std::optional calculate(std::span left, + std::span right, + uint32_t threshold); }; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index 388099a0358..e8b1bb5b415 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -25,11 +25,14 @@ struct MatchAlgorithm { static constexpr uint8_t max_edits() noexcept { return MaxEdits; } /** - * Matches UTF-8 source string `source` against the target DFA, optionally generating + * Matches UTF-8/32 source string `source` against the target DFA, optionally generating * the successor string iff the source string is not within the maximum number of edits * of the target string. * - * The actual match loop is very simple: we try to match the DFA as far as we can + * The actual match loop is very simple, but with some subtle differences in semantics + * depending on whether the "full string" or "prefix" matching mode is used. + * + * For the full string matching mode, we try to match the DFA as far as we can * before either consuming all input (source string) characters or ending up in a non- * matching state before we have consumed all input. In the former case, we may be in * a matching state (consider matching "foo" with the target string "food"; after @@ -39,6 +42,15 @@ struct MatchAlgorithm { * If we end up in a matching state, all is well. We simply return a MatchResult with * the number of edits the state represents. * + * Prefix matching is very similar, but since we're interested in the number of edits + * required to transform a _prefix_ of the source into the target string, we cannot + * require matching the entire source string in the case that it is longer than the + * target (as that would not just be a prefix, after all). We consider a source prefix + * to be matched if _any_ DFA state is a match. This may be the initial state. + * Since the first match might not yield the minimum edit distance, we traverse the + * DFA until it no longer matches and return the smallest encountered distance as the + * final result. + * * The interesting bit happens the string does _not_ match and we are asked to provide a * _successor_ string that _does_ match and is strictly greater in lexicographic order. * @@ -143,6 +155,18 @@ struct MatchAlgorithm { * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different * ordering when compared byte-wise against an implicitly lowercased dictionary. * + * The successor generation algorithm generalizes without changes to work as expected + * when prefix matching is used. This is because the target string represents the + * prefix and the successor consequently represents the smallest possible greater prefix. + * + * Proof by contradiction: S is a source string that does _not_ prefix match target + * string T, and S' is the returned successor. Assume that discarding (skipping) all + * candidate source strings C with S < C < S' misses at least one such candidate C' + * that has a prefix matching T. This means that it must be possible to traverse the + * DFA with source string C' and end up in a matching state. But by definition the + * successor S' is the smallest possible lexicographically greater string that can + * possibly match the DFA for T; a contradiction. + * * TODO let matcher know if source string is pre-normalized (i.e. lowercased). */ template @@ -158,6 +182,9 @@ struct MatchAlgorithm { StateType state = matcher.start(); while (u8_reader.hasMore()) { + if (matcher.is_prefix() && matcher.is_match(state)) { + return minimal_matching_prefix_distance(matcher, state, u8_reader); + } const auto pos_before_char = static_cast(successor_out.size()); const uint32_t raw_mch = u8_reader.getChar(); const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); @@ -198,6 +225,9 @@ struct MatchAlgorithm { Utf8Reader u8_reader(source.data(), source.size()); StateType state = matcher.start(); while (u8_reader.hasMore()) { + if (matcher.is_prefix() && matcher.is_match(state)) { + return minimal_matching_prefix_distance(matcher, state, u8_reader); + } const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased()); auto maybe_next = matcher.match_input(state, mch); if (matcher.can_match(maybe_next)) { @@ -213,6 +243,41 @@ struct MatchAlgorithm { return MatchResult::make_mismatch(max_edits()); } + /** + * Follows the DFA (in an already matching state) until it no longer matches, keeping + * track of the minimum edit distance encountered. This is used to compute the minimum + * number of edits during prefix matching, as just returning the edit distance of the + * _first_ matching state may not result in the minimum. + * + * Example 1: matching the source string 'bananas' against the prefix target 'ban' with + * max edits 2 will be in a matching state already after consuming 'b' (can append 2 + * characters after), but had we processed the remaining characters we would end up with + * the actual prefix edit distance of 0. + * + * Example 2: matching the source string 'abcdef' against target 'acd' with max edits 2 + * will be in a matching state after 'a' (2 edits), but the minimal edit prefix is 'abcd' + * (1 edit). This demonstrates that we may have to process more than |target| number of + * source characters to get a minimal distance. + */ + template + static MatchResult minimal_matching_prefix_distance(const Matcher& matcher, + typename Matcher::StateType state, + Utf8Reader& u8_reader) + { + auto min_edits = matcher.match_edit_distance(state); + while (u8_reader.hasMore()) { + const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased()); + auto maybe_next = matcher.match_input(state, mch); + if (matcher.can_match(maybe_next)) { + state = maybe_next; + min_edits = std::min(min_edits, matcher.match_edit_distance(state)); + } else { + break; + } + } + return MatchResult::make_match(max_edits(), min_edits); + } + /** * Instantly backtrack to the last possible branching point in the DFA where we can * choose some higher outgoing edge character value and still match the DFA. If the node diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h index 4ad203c9a46..a15f321f32d 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h @@ -46,12 +46,13 @@ public: private: const std::vector _lookup; const bool _is_cased; + const bool _is_prefix; static std::vector make_lookup(const std::vector &str); public: using MatchResult = LevenshteinDfa::MatchResult; - TableDfa(std::vector str, bool is_cased); + TableDfa(std::vector str, bool is_cased, bool is_prefix); ~TableDfa() override; [[nodiscard]] MatchResult match(std::string_view source) const override; [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override; diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp index e7c721f644e..721cfe296d2 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp @@ -353,11 +353,13 @@ struct TableMatcher { const TableDfa::Lookup *lookup; const uint32_t end; const bool cased; + const bool prefix; - TableMatcher(const TableDfa::Lookup *lookup_in, uint32_t end_in, bool cased_in) - noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {} + TableMatcher(const TableDfa::Lookup *lookup_in, uint32_t end_in, bool cased_in, bool prefix_in) + noexcept : lookup(lookup_in), end(end_in), cased(cased_in), prefix(prefix_in) {} bool is_cased() const noexcept { return cased; } + bool is_prefix() const noexcept { return prefix; } static constexpr S start() noexcept { return S::start(); } uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); } @@ -473,9 +475,10 @@ TableDfa::make_lookup(const std::vector &str)->std::vector } template -TableDfa::TableDfa(std::vector str, bool is_cased) +TableDfa::TableDfa(std::vector str, bool is_cased, bool is_prefix) : _lookup(make_lookup(str)), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) { } @@ -486,7 +489,7 @@ template LevenshteinDfa::MatchResult TableDfa::match(std::string_view u8str) const { - TableMatcher matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + TableMatcher matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str); } @@ -494,7 +497,7 @@ template LevenshteinDfa::MatchResult TableDfa::match(std::string_view u8str, std::string& successor_out) const { - TableMatcher matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + TableMatcher matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str, successor_out); } @@ -502,7 +505,7 @@ template LevenshteinDfa::MatchResult TableDfa::match(std::string_view u8str, std::vector& successor_out) const { - TableMatcher matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + TableMatcher matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix); return MatchAlgorithm::match(matcher, u8str, successor_out); } -- cgit v1.2.3