Merge pull request #30932 from vespa-engine/vekterli/levenshtein-prefix-matching-algo-support

Add prefix match support to Levenshtein algorithm implementations
author: Tor Brede Vekterli <vekterli@vespa.ai> 2024-04-19 12:47:00 +0200
committer: GitHub <noreply@github.com> 2024-04-19 12:47:00 +0200
commit: 06b7bccd5586ed353069593a71535e4c958bc63e (patch)
tree: 40c7b3d9a9759bdffa1bff6c862dd6546e80c8cd /vespalib
parent: bfa5d861a4492f9935c16a1596b30f7889143455 (diff)
parent: 0e8aab2c447e0ed5cac035d385c95ad44d8594d6 (diff)
14 files changed, 379 insertions, 69 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index 69b34ece2c7..dfd1b3b0c3b 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -24,12 +24,13 @@ using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaT
 namespace {
 
 std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
-                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type,
+                                     LevenshteinDfa::Matching matching)
 {
-    auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
+    auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type, matching);
     auto maybe_match_lhs = dfa_lhs.match(right);
 
-    auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
+    auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type, matching);
     auto maybe_match_rhs = dfa_rhs.match(left);
 
     EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
@@ -40,6 +41,12 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig
     return std::nullopt;
 }
 
+std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
+                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+    return do_calculate(left, right, threshold, casing, dfa_type, LevenshteinDfa::Matching::FullString);
+}
+
 std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold,
                                      LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
 {
@@ -74,6 +81,16 @@ struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
         return do_calculate(left, right, threshold, casing(), dfa_type());
     }
 
+    static std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+        // Prefix matching is not symmetric, cannot use do_calculate() as it implicitly checks this.
+        auto dfa = LevenshteinDfa::build(left, threshold, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix);
+        auto maybe_match = dfa.match(right);
+        if (maybe_match.matches()) {
+            return {maybe_match.edits()};
+        }
+        return std::nullopt;
+    }
+
 };
 
 // All the baseline DFA tests use lowercase only, so they should have the exact same outcome
@@ -98,6 +115,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) {
         EXPECT_EQ(calculate("abc", "ab", max),   std::optional{1}) << max;
         EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max;
         EXPECT_EQ(calculate("a", "", max),       std::optional{1}) << max;
+        EXPECT_EQ(calculate("", "", max),        std::optional{0}) << max;
     }
     EXPECT_EQ(calculate("bc", "abcd", 2),  std::optional{2});
     EXPECT_EQ(calculate("ab", "abcd", 2),  std::optional{2});
@@ -109,6 +127,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) {
     EXPECT_EQ(calculate("ab", "", 2),      std::optional{2});
     EXPECT_EQ(calculate("abc", "", 2),     std::nullopt);
     EXPECT_EQ(calculate("abc", "123", 2),  std::nullopt);
+    EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt);
 }
 
 TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
@@ -124,6 +143,37 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
     EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt);
 }
 
+TEST_P(LevenshteinDfaTest, can_match_in_target_string_prefix_mode) {
+    for (auto max : {1, 2}) {
+        EXPECT_EQ(prefix_calculate("",    "literally anything", max), std::optional{0});
+        EXPECT_EQ(prefix_calculate("",    "", max),        std::optional{0});
+        EXPECT_EQ(prefix_calculate("x",   "", max),        std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abc", max),     std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcd", max),    std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcdef", max),  std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "ab", max),      std::optional{1});
+        EXPECT_EQ(prefix_calculate("ac",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acd", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("bc",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acb", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acdefg", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acb", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abd", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbc", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xy", max),      std::nullopt);
+    }
+    EXPECT_EQ(prefix_calculate("abc", "xxabc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("xy",  "", 2), std::optional{2});
+    EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
+}
+
 void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source,
                         std::string_view expected_successor, std::string_view successor_prefix)
 {
@@ -182,11 +232,22 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
                             "\xf0\x9f\xa4\xa9""food");    // <starry eyed emoji>food
 
     // Note that as a general rule, emojis are fickle beasts to deal with since a single
-    // emoji often takes up multiple code points, which we consider separate characters
-    // but a user sees as a single actual rendered glyph.
+    // emoji often takes up multiple code points. We consider these as separate characters
+    // but from a Unicode perspective they are considered part of an "extended grapheme
+    // cluster", from which the actually rendered glyph is derived.
     // Multi-code point character edit distance support is left as an exercise for the reader :D
 }
 
+TEST_P(LevenshteinDfaTest, can_generate_successors_with_prefix_match_mode) {
+    auto dfa = LevenshteinDfa::build("ban", 1, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix);
+    // There is no difference at all in successor output when using Prefix mode as opposed to
+    // FullString mode, but since we match _more_ source strings in prefix mode we cannot
+    // simply run the same test set for both modes. This just tests that the wiring seems fine.
+    test_dfa_successor(dfa, "",      "\x01""an");
+    test_dfa_successor(dfa, "xy",    "yan");
+    test_dfa_successor(dfa, "bolle", "bon");
+}
+
 TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
     auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
     // The successor string must be lexicographically larger than the input string.
@@ -332,13 +393,21 @@ std::string bits_to_str(T v) {
     return ret;
 }
 
-using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>;
+using CasingAndDfaTypeAndMaxEdits = std::tuple<
+    LevenshteinDfa::Casing,
+    LevenshteinDfa::DfaType,
+    LevenshteinDfa::Matching,
+    uint32_t
+>;
 
 struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> {
-    // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number.
+    // Print test suffix as e.g. "/Uncased_Explicit_FullString_1" instead of just a GTest-chosen number.
     static std::string stringify_params(const TestParamInfo<ParamType>& info) {
         std::ostringstream ss;
-        ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param);
+        ss << std::get<0>(info.param)
+           << '_' << std::get<1>(info.param)
+           << '_' << std::get<2>(info.param)
+           << '_' << std::get<3>(info.param);
         return ss.str();
     }
 };
@@ -350,6 +419,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
                                  Values(LevenshteinDfa::DfaType::Explicit,
                                         LevenshteinDfa::DfaType::Implicit,
                                         LevenshteinDfa::DfaType::Table),
+                                 Values(LevenshteinDfa::Matching::FullString,
+                                        LevenshteinDfa::Matching::Prefix),
                                  Values(1, 2)),
                          LevenshteinDfaSuccessorTest::stringify_params);
 
@@ -369,10 +440,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
  * Inspired by approach used by Lucene DFA exhaustive testing.
  */
 TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
-    const auto [casing, dfa_type, max_edits] = GetParam();
+    const auto [casing, dfa_type, matching, max_edits] = GetParam();
     for (uint32_t i = 0; i < 256; ++i) {
         const auto target = bits_to_str(static_cast<uint8_t>(i));
-        auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type);
+        auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type, matching);
         std::string skip_to, successor;
         for (uint32_t j = 0; j < 256; ++j) {
             const auto source = bits_to_str(static_cast<uint8_t>(j));
diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
index 37d1d4b4c54..918cbc624db 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
@@ -16,6 +16,13 @@ std::optional<uint32_t> calculate(std::string_view left, std::string_view right,
     return leftRight;
 }
 
+// Prefix matching is asymmetric and therefore cannot implicitly test result symmetry
+std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+    auto left_codepoints  = vespalib::LowerCase::convert_to_ucs4(left);
+    auto right_codepoints = vespalib::LowerCase::convert_to_ucs4(right);
+    return vespalib::LevenshteinDistance::calculate(left_codepoints, right_codepoints, threshold, true);
+}
+
 TEST(LevenshteinDistance, calculate_edgecases) {
     EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0});
     EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1});
@@ -33,6 +40,54 @@ TEST(LevenshteinDistance, calculate_edgecases) {
     EXPECT_EQ(calculate("ab", "", 2), std::optional{2});
     EXPECT_EQ(calculate("abc", "", 2), std::nullopt);
     EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+    EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt);
+}
+
+TEST(LevenshteinDistance, prefix_match_edge_cases) {
+    // Same cases as LevenshteinDfaTest (TODO consolidate these somehow)
+    for (auto max : {1, 2}) {
+        EXPECT_EQ(prefix_calculate("",    "literally anything", max), std::optional{0});
+        EXPECT_EQ(prefix_calculate("",    "", max),        std::optional{0});
+        EXPECT_EQ(prefix_calculate("x",   "", max),        std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abc", max),     std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcd", max),    std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcdef", max),  std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "ab", max),      std::optional{1});
+        EXPECT_EQ(prefix_calculate("ac",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acd", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("bc",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acb", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acdefg", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acb", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abd", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbc", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xy", max),      std::nullopt);
+    }
+    EXPECT_EQ(prefix_calculate("abc", "xxabc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("xy",  "", 2), std::optional{2});
+    EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
+
+    // Max edits > 2 cases; not supported by DFA implementation.
+    EXPECT_EQ(prefix_calculate("abc", "", 3),      std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xy", 3),    std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xyz", 3),   std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xyzzz", 3), std::optional{3});
+    EXPECT_EQ(prefix_calculate("abcd", "xyzd", 3), std::optional{3});
+    EXPECT_EQ(prefix_calculate("abcd", "xyzz", 3), std::nullopt);
+    EXPECT_EQ(prefix_calculate("abcd", "", 3),     std::nullopt);
+}
+
+TEST(LevenshteinDistance, oversized_max_edits_is_well_defined) {
+    const auto k = uint32_t(INT32_MAX) + 10000u;
+    EXPECT_EQ(calculate("abc", "xyz", k), std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xyzzzz", k), std::optional{3});
 }
 
 GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
index c69f414aae6..8b2597dd179 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -20,6 +20,12 @@ concept DfaMatcher = requires(T a, std::string u8str, std::vector<uint32_t> u32s
     // matching to have the expected semantics, the actual target string must be pre-lowercased.
     { a.is_cased() } -> std::same_as<bool>;
 
+    // Whether the matching is performed in prefix mode. In prefix mode, a source string is
+    // considered a match if it matches the target string at any point during DFA traversal.
+    // I.e. the source string "bananas" prefix-matched against the target (prefix) "ban" will
+    // be returned as a match with 0 edits.
+    { a.is_prefix() } -> std::same_as<bool>;
+
     // Initial (starting) state of the DFA
     { a.start() } -> std::same_as<typename T::StateType>;
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 490582b5bf7..a3542d61dc8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -97,9 +97,11 @@ public:
 private:
     std::vector<DfaNodeType> _nodes;
     const bool               _is_cased;
+    const bool               _is_prefix;
 public:
-    explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept
-        : _is_cased(is_cased)
+    ExplicitLevenshteinDfaImpl(bool is_cased, bool is_prefix) noexcept
+        : _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
     ~ExplicitLevenshteinDfaImpl() override;
 
@@ -140,10 +142,12 @@ template <typename Traits>
 class ExplicitLevenshteinDfaBuilder {
     const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
     const bool                  _is_cased;
+    const bool                  _is_prefix;
 public:
-    ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased) noexcept
+    ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased, bool is_prefix) noexcept
         : _u32_str_buf(std::move(str)),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
 
     [[nodiscard]] LevenshteinDfa build_dfa() const;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 55dd459ff26..11a822d2d7e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -22,16 +22,20 @@ struct ExplicitDfaMatcher {
 
     const std::span<const DfaNodeType> _nodes;
     const bool                         _is_cased;
+    const bool                         _is_prefix;
 
-    ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased) noexcept
+    ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased, bool is_prefix) noexcept
         : _nodes(nodes),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
 
     static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
 
     bool is_cased() const noexcept { return _is_cased; }
 
+    bool is_prefix() const noexcept { return _is_prefix; }
+
     StateType start() const noexcept {
         return &_nodes[0];
     }
@@ -99,21 +103,21 @@ ExplicitLevenshteinDfaImpl<MaxEdits>::~ExplicitLevenshteinDfaImpl() = default;
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str);
 }
 
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string& successor_out) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
 }
 
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
 }
 
@@ -199,10 +203,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> {
     using Base::transitions;
 
     const bool _is_cased;
+    const bool _is_prefix;
 public:
-    ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased) noexcept
+    ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased, bool is_prefix) noexcept
         : DfaSteppingBase<Traits>(str),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {
         assert(str.size() < UINT32_MAX / max_out_edges_per_node());
     }
@@ -217,7 +223,7 @@ public:
 
 template <typename Traits>
 LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
-    auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased);
+    auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased, _is_prefix);
     ExploreState<StateType> exp;
     // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_
     // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth.
@@ -257,7 +263,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
 
 template <typename Traits>
 LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const {
-    ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased);
+    ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased, _is_prefix);
     return builder.build_dfa();
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 20c8e1b7e2b..552fd849734 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -13,14 +13,16 @@ class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
     std::string                 _target_as_utf8;
     std::vector<uint32_t>       _target_utf8_char_offsets;
     const bool                  _is_cased;
+    const bool                  _is_prefix;
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
 
-    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased)
+    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix)
         : _u32_str_buf(std::move(str)),
           _target_as_utf8(),
           _target_utf8_char_offsets(),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {
         precompute_utf8_target_with_offsets();
     }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index 05ef2761f34..7dadcc59b8e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -32,21 +32,26 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     std::span<const char>     _target_as_utf8;
     std::span<const uint32_t> _target_utf8_char_offsets;
     const bool                _is_cased;
+    const bool                _is_prefix;
 
     ImplicitDfaMatcher(std::span<const uint32_t> u32_str,
                        std::span<const char>     target_as_utf8,
                        std::span<const uint32_t> target_utf8_char_offsets,
-                       bool is_cased) noexcept
+                       bool is_cased,
+                       bool is_prefix) noexcept
         : Base(u32_str),
           _target_as_utf8(target_as_utf8),
           _target_utf8_char_offsets(target_utf8_char_offsets),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
 
     // start, is_match, can_match, match_edit_distance are all provided by base type
 
     bool is_cased() const noexcept { return _is_cased; }
 
+    bool is_prefix() const noexcept { return _is_prefix; }
+
     template <typename F>
     bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
         for (uint32_t i = 0; i < state.size(); ++i) {
@@ -137,21 +142,21 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str);
 }
 
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string& successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 19c2fffbb3e..468619c8036 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -41,34 +41,40 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
     _impl->dump_as_graphviz(out);
 }
 
-LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits,
+                                     Casing casing, DfaType dfa_type, Matching matching) {
     if (max_edits != 1 && max_edits != 2) {
         throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
     }
-    const bool is_cased = (casing == Casing::Cased);
+    const bool is_cased    = (casing == Casing::Cased);
+    const bool is_prefix   = (matching == Matching::Prefix);
     auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string)
                                       : utf8_string_to_utf32_lowercased(target_string);
     if (dfa_type == DfaType::Implicit) {
         if (max_edits == 1) {
-            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased, is_prefix));
         } else { // max_edits == 2
-            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased, is_prefix));
         }
     } else if(dfa_type == DfaType::Explicit) {
         if (max_edits == 1) {
-            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
+            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased, is_prefix).build_dfa();
         } else { // max_edits == 2
-            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
+            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased, is_prefix).build_dfa();
         }
     } else { // DfaType::Table
         if (max_edits == 1) {
-            return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased, is_prefix));
         } else { // max_edits == 2
-            return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased, is_prefix));
         }
     }
 }
 
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
+    return build(target_string, max_edits, casing, dfa_type, Matching::FullString);
+}
+
 LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
     // TODO automatically select implementation based on target length/max edits?
     //  Suggestion:
@@ -112,4 +118,12 @@ std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) {
     return os;
 }
 
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m) {
+    switch (m) {
+    case LevenshteinDfa::Matching::FullString: os << "FullString"; return os;
+    case LevenshteinDfa::Matching::Prefix:     os << "Prefix"; return os;
+    }
+    abort();
+}
+
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index 44c62bdb957..feace39b313 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -265,6 +265,33 @@ public:
         Cased
     };
 
+    enum class Matching {
+        /**
+         * Edit distance is computed based on the _entire_ source string. Matching is
+         * symmetric between source and target strings, i.e. match(x, y) and match(y, x)
+         * will yield the same result.
+         */
+        FullString,
+        /**
+         * Edit distance is computed based on a _prefix_ of the source string, as compared
+         * against the target string. Matching is therefore _asymmetric_ between source and
+         * target strings.
+         *
+         * Example of matching source strings against the target 'ban' (i.e. the prefix query)
+         * and 1 max edit distance:
+         *   'bananas'  - 0 edits (source prefix 'ban' exact-matches 'ban')
+         *   'balloons' - 1 edit ('bal' vs 'ban')
+         *   '2bananas' - 1 edit ('2ban' vs 'ban')
+         *   'boonanas' - mismatch (2 edits)
+         *
+         * Note that Prefix matching will match a lot more strings than FullString, so in
+         * practice it should be combined with prefix _locking_ to constrain the candidate
+         * result set to a more reasonable cardinality. In particular, max edits >= |target|
+         * will match _every_ source string trivially.
+         */
+        Prefix
+    };
+
     /**
      * Builds and returns a Levenshtein DFA that matches all strings within `max_edits`
      * edits of `target_string`. The type of DFA returned is specified by dfa_type.
@@ -274,6 +301,9 @@ public:
      * `target_string` must not contain any null UTF-8 chars.
      */
     [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits,
+                                              Casing casing, DfaType dfa_type, Matching matching);
+
+    [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits,
                                               Casing casing, DfaType dfa_type);
 
     /**
@@ -301,5 +331,6 @@ public:
 std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos);
 std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt);
 std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c);
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m);
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
index 4658635d880..f5b8d0fc0b8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
@@ -3,31 +3,46 @@
 
 #include "levenshtein_distance.h"
 
+#include <cassert>
 #include <limits>
 #include <vector>
 
+namespace vespalib {
+
 std::optional<uint32_t>
-vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold)
+LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right,
+                               uint32_t threshold, bool prefix_match)
 {
+    assert(left.size() <= static_cast<size_t>(INT32_MAX));
+    assert(right.size() <= static_cast<size_t>(INT32_MAX));
+    threshold = std::min(threshold, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
     uint32_t n = left.size();
     uint32_t m = right.size();
 
-    if (n > m) {
-        return calculate(right, left, threshold);
-    }
-
-    // if one string is empty, the edit distance is necessarily the length
-    // of the other
-    if (n == 0) {
-        return m <= threshold ? std::optional(m) : std::nullopt;
-    }
-    if (m == 0) {
-        return n <= threshold ? std::optional(n) : std::nullopt;
-    }
-
-    // the edit distance cannot be less than the length difference
-    if (m - n > threshold) {
-        return std::nullopt;
+    if (!prefix_match) {
+        // These optimizations are only valid when matching with target/source string symmetry.
+        // Correctness of the main matrix calculation loop should not depend on these.
+        if (n > m) {
+            return calculate(right, left, threshold, false);
+        }
+        // if one string is empty, the edit distance is necessarily the length
+        // of the other.
+        if (n == 0) {
+            return m <= threshold ? std::optional(m) : std::nullopt;
+        }
+        if (m == 0) {
+            return n <= threshold ? std::optional(n) : std::nullopt;
+        }
+        // the edit distance cannot be less than the length difference
+        if (m - n > threshold) {
+            return std::nullopt;
+        }
+    } else {
+        // A source (right) cannot be transformed into a target prefix (left) if doing
+        // so would require inserting more than max edits number of characters.
+        if ((n > m) && (n - m > threshold)) {
+            return std::nullopt;
+        }
     }
 
     std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally
@@ -48,6 +63,7 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp
     }
 
     // iterates through t
+    uint32_t min_edits = n; // prefix matching: worst-case to transform to target
     for (uint32_t j = 1; j <= m; ++j) {
         uint32_t rightJ = right[j - 1]; // jth character of right
         d[0] = j;
@@ -56,9 +72,9 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp
 
         uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ?
                        n : std::min(n, j + threshold);
-
         // ignore entry left of leftmost
         if (min > 1) {
+            assert(static_cast<size_t>(min) <= d.size());
             d[min - 1] = std::numeric_limits<uint32_t>::max();
         }
 
@@ -76,12 +92,35 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp
             lowerBound = std::min(lowerBound, d[i]);
         }
         if (lowerBound > threshold) {
-            return std::nullopt;
+            if (!prefix_match) {
+                return std::nullopt; // Cannot match
+            } else {
+                break; // May already have matched via min_edits
+            }
         }
         std::swap(p, d);
+        // For prefix matching:
+        // By definition, the Levenshtein matrix cell at row `i`, column `j`
+        // provides the minimum number of edits required to transform a prefix of
+        // source string S (up to and including length `i`) into a prefix of target
+        // string T (up to and including length `j`). Since we want to match against
+        // the entire target (prefix query) string with length `n`, the problem is
+        // reduced to finding the minimum value of the `n`th column that is `<= k`
+        // (aggregated here and checked after the loop).
+        min_edits = std::min(p[n], min_edits);
     }
-    if (p[n] <= threshold) {
+    if (prefix_match) {
+        return ((min_edits <= threshold) ? std::optional<uint32_t>{min_edits} : std::nullopt);
+    } else if (p[n] <= threshold) {
         return {p[n]};
     }
     return std::nullopt;
 }
+
+std::optional<uint32_t>
+LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold)
+{
+    return calculate(left, right, threshold, false);
+}
+
+} // vespalib
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
index f105dcdaaf4..e284e071731 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
@@ -19,7 +19,15 @@ namespace vespalib {
  */
 class LevenshteinDistance {
 public:
-    static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold);
+    // Iff `prefix_match` == true, `right` is the candidate to match against prefix `left`
+    static std::optional<uint32_t> calculate(std::span<const uint32_t> left,
+                                             std::span<const uint32_t> right,
+                                             uint32_t threshold,
+                                             bool prefix_match);
+
+    static std::optional<uint32_t> calculate(std::span<const uint32_t> left,
+                                             std::span<const uint32_t> right,
+                                             uint32_t threshold);
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index 388099a0358..e8b1bb5b415 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -25,11 +25,14 @@ struct MatchAlgorithm {
     static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
 
     /**
-     * Matches UTF-8 source string `source` against the target DFA, optionally generating
+     * Matches UTF-8/32 source string `source` against the target DFA, optionally generating
      * the successor string iff the source string is not within the maximum number of edits
      * of the target string.
      *
-     * The actual match loop is very simple: we try to match the DFA as far as we can
+     * The actual match loop is very simple, but with some subtle differences in semantics
+     * depending on whether the "full string" or "prefix" matching mode is used.
+     *
+     * For the full string matching mode, we try to match the DFA as far as we can
      * before either consuming all input (source string) characters or ending up in a non-
      * matching state before we have consumed all input. In the former case, we may be in
      * a matching state (consider matching "foo" with the target string "food"; after
@@ -39,6 +42,15 @@ struct MatchAlgorithm {
      * If we end up in a matching state, all is well. We simply return a MatchResult with
      * the number of edits the state represents.
      *
+     * Prefix matching is very similar, but since we're interested in the number of edits
+     * required to transform a _prefix_ of the source into the target string, we cannot
+     * require matching the entire source string in the case that it is longer than the
+     * target (as that would not just be a prefix, after all). We consider a source prefix
+     * to be matched if _any_ DFA state is a match. This may be the initial state.
+     * Since the first match might not yield the minimum edit distance, we traverse the
+     * DFA until it no longer matches and return the smallest encountered distance as the
+     * final result.
+     *
      * The interesting bit happens the string does _not_ match and we are asked to provide a
      * _successor_ string that _does_ match and is strictly greater in lexicographic order.
      *
@@ -143,6 +155,18 @@ struct MatchAlgorithm {
      * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
      * ordering when compared byte-wise against an implicitly lowercased dictionary.
      *
+     * The successor generation algorithm generalizes without changes to work as expected
+     * when prefix matching is used. This is because the target string represents the
+     * prefix and the successor consequently represents the smallest possible greater prefix.
+     *
+     * Proof by contradiction: S is a source string that does _not_ prefix match target
+     * string T, and S' is the returned successor. Assume that discarding (skipping) all
+     * candidate source strings C with S < C < S' misses at least one such candidate C'
+     * that has a prefix matching T. This means that it must be possible to traverse the
+     * DFA with source string C' and end up in a matching state. But by definition the
+     * successor S' is the smallest possible lexicographically greater string that can
+     * possibly match the DFA for T; a contradiction.
+     *
      * TODO let matcher know if source string is pre-normalized (i.e. lowercased).
      */
     template <DfaMatcher Matcher, typename SuccessorT>
@@ -158,6 +182,9 @@ struct MatchAlgorithm {
 
         StateType state = matcher.start();
         while (u8_reader.hasMore()) {
+            if (matcher.is_prefix() && matcher.is_match(state)) {
+                return minimal_matching_prefix_distance(matcher, state, u8_reader);
+            }
             const auto pos_before_char = static_cast<uint32_t>(successor_out.size());
             const uint32_t raw_mch     = u8_reader.getChar();
             const uint32_t mch         = normalized_match_char(raw_mch, matcher.is_cased());
@@ -198,6 +225,9 @@ struct MatchAlgorithm {
         Utf8Reader u8_reader(source.data(), source.size());
         StateType state = matcher.start();
         while (u8_reader.hasMore()) {
+            if (matcher.is_prefix() && matcher.is_match(state)) {
+                return minimal_matching_prefix_distance(matcher, state, u8_reader);
+            }
             const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased());
             auto maybe_next    = matcher.match_input(state, mch);
             if (matcher.can_match(maybe_next)) {
@@ -214,6 +244,41 @@ struct MatchAlgorithm {
     }
 
     /**
+     * Follows the DFA (in an already matching state) until it no longer matches, keeping
+     * track of the minimum edit distance encountered. This is used to compute the minimum
+     * number of edits during prefix matching, as just returning the edit distance of the
+     * _first_ matching state may not result in the minimum.
+     *
+     * Example 1: matching the source string 'bananas' against the prefix target 'ban' with
+     * max edits 2 will be in a matching state already after consuming 'b' (can append 2
+     * characters after), but had we processed the remaining characters we would end up with
+     * the actual prefix edit distance of 0.
+     *
+     * Example 2: matching the source string 'abcdef' against target 'acd' with max edits 2
+     * will be in a matching state after 'a' (2 edits), but the minimal edit prefix is 'abcd'
+     * (1 edit). This demonstrates that we may have to process more than |target| number of
+     * source characters to get a minimal distance.
+     */
+    template <DfaMatcher Matcher>
+    static MatchResult minimal_matching_prefix_distance(const Matcher& matcher,
+                                                        typename Matcher::StateType state,
+                                                        Utf8Reader& u8_reader)
+    {
+        auto min_edits = matcher.match_edit_distance(state);
+        while (u8_reader.hasMore()) {
+            const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased());
+            auto maybe_next    = matcher.match_input(state, mch);
+            if (matcher.can_match(maybe_next)) {
+                state = maybe_next;
+                min_edits = std::min(min_edits, matcher.match_edit_distance(state));
+            } else {
+                break;
+            }
+        }
+        return MatchResult::make_match(max_edits(), min_edits);
+    }
+
+    /**
      * Instantly backtrack to the last possible branching point in the DFA where we can
      * choose some higher outgoing edge character value and still match the DFA. If the node
      * has a wildcard edge, we can bump the input char by one and generate the smallest
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
index 4ad203c9a46..a15f321f32d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
@@ -46,12 +46,13 @@ public:
 private:
     const std::vector<Lookup> _lookup;
     const bool                _is_cased;
+    const bool                _is_prefix;
 
     static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str);
     
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
-    TableDfa(std::vector<uint32_t> str, bool is_cased);
+    TableDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix);
     ~TableDfa() override;
     [[nodiscard]] MatchResult match(std::string_view source) const override;
     [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
index e7c721f644e..721cfe296d2 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
@@ -353,11 +353,13 @@ struct TableMatcher {
     const TableDfa<N>::Lookup *lookup;
     const uint32_t             end;
     const bool                 cased;
+    const bool                 prefix;
 
-    TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in)
-      noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {}
+    TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in, bool prefix_in)
+      noexcept : lookup(lookup_in), end(end_in), cased(cased_in), prefix(prefix_in) {}
     
     bool is_cased() const noexcept { return cased; }
+    bool is_prefix() const noexcept { return prefix; }
     static constexpr S start() noexcept { return S::start(); }
 
     uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); }
@@ -473,9 +475,10 @@ TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup>
 }
 
 template <uint8_t N>
-TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased)
+TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix)
   : _lookup(make_lookup(str)),
-    _is_cased(is_cased)
+    _is_cased(is_cased),
+    _is_prefix(is_prefix)
 {
 }
 
@@ -486,7 +489,7 @@ template <uint8_t N>
 LevenshteinDfa::MatchResult
 TableDfa<N>::match(std::string_view u8str) const
 {
-    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix);
     return MatchAlgorithm<N>::match(matcher, u8str);
 }
 
@@ -494,7 +497,7 @@ template <uint8_t N>
 LevenshteinDfa::MatchResult
 TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const
 {
-    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix);
     return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
 }
 
@@ -502,7 +505,7 @@ template <uint8_t N>
 LevenshteinDfa::MatchResult
 TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const
 {
-    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix);
     return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
 }
author	Tor Brede Vekterli <vekterli@vespa.ai>	2024-04-19 12:47:00 +0200
committer	GitHub <noreply@github.com>	2024-04-19 12:47:00 +0200
commit	06b7bccd5586ed353069593a71535e4c958bc63e (patch)
tree	40c7b3d9a9759bdffa1bff6c862dd6546e80c8cd /vespalib
parent	bfa5d861a4492f9935c16a1596b30f7889143455 (diff)
parent	0e8aab2c447e0ed5cac035d385c95ad44d8594d6 (diff)