From 0e8aab2c447e0ed5cac035d385c95ad44d8594d6 Mon Sep 17 00:00:00 2001
From: Tor Brede Vekterli <vekterli@vespa.ai>
Date: Wed, 3 Apr 2024 09:39:31 +0000
Subject: Add prefix match support to Levenshtein algorithm implementations

Adds support for matching the _prefix_ of a source string against
a target string (the prefix query) within a bounded maximum number
of `k` max edits. Iff the number of edits required is within the
specified bound, returns the _minimum_ number of edits required to
transform the source string prefix to the full target string.

By convention, we treat the target string as the "columnar" string
in the Levenshtein matrix (i.e. its characters are column-indexed,
whereas the source string is row-indexed). This matters for prefix
matching, because unlike regular Levenshtein matching it is _not_
symmetric between source and target strings.

By definition, the Levenshtein matrix cell at row `i`, column `j`
provides the minimum number of edits required to transform a prefix
of source string S (up to and including length `i`) into a prefix of
target string T (up to and including length `j`). Since we want to
match against the entire target (prefix query) string of length `n`,
the problem is reduced to finding the minimum value of the `n`th
column that is `<= k`.

Example: matching the source string `abcdef` against the target `acd`
with `k` == 2:

     a c d
   0 1 2 3
 a 1 0 1 2
 b 2 1 1 2
 c 3 2 1 2
 d 4 3 2 1
 e 5 4 3 2
 f 6 5 4 3

In this case, the _shortest_ matching prefix is simply `a`, but the
_minimum edits_ prefix is `abcd`. The latter prefix's distance is
what we return.

For our generalized (i.e. arbitrary `k`) Levenshtein implementation,
this is implemented fairly straight forward since it operates on
matrix rows already (sparsely populated around the diagonal).

For the DFA implementation(s), transitioning between states in a
Levenshtein DFA is equivalent to explicitly computing the (sparse)
matrix columns around the diagonal for the source character being
currently matched, so the same principle applies directly to it.

Since we don't have any explicit notion of the value of matrix columns
in the abstract DFA, a source string in prefix mode will be considered
a match when _any_ DFA state is a match.

By definition, this is as-if the matrix row represented by the state
has a `n`th column that is <= `k`. We then follow the DFA until it
can no longer match, keeping track of the lowest state edit distance
encountered. This mirrors finding the row whose `n`th column
minimizes `k`.

Prefix matching support has been added to the core DFA matching
loop algorithms, with only very minor changes to the underlying DFA
implementations. Successor generation upon mismatch should work
as expected with no algorithmic changes introduced for prefix
matching. Prefix match mode has been added as a dimension to the
exhaustive successor checking unit test.
---
 vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp  | 91 +++++++++++++++++++---
 .../src/tests/fuzzy/levenshtein_distance_test.cpp  | 55 +++++++++++++
 vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h    |  6 ++
 .../vespalib/fuzzy/explicit_levenshtein_dfa.h      | 12 ++-
 .../vespalib/fuzzy/explicit_levenshtein_dfa.hpp    | 24 +++---
 .../vespalib/fuzzy/implicit_levenshtein_dfa.h      |  6 +-
 .../vespalib/fuzzy/implicit_levenshtein_dfa.hpp    | 15 ++--
 .../src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp   | 30 +++++--
 .../src/vespa/vespalib/fuzzy/levenshtein_dfa.h     | 31 ++++++++
 .../vespa/vespalib/fuzzy/levenshtein_distance.cpp  | 79 ++++++++++++++-----
 .../vespa/vespalib/fuzzy/levenshtein_distance.h    | 10 ++-
 .../src/vespa/vespalib/fuzzy/match_algorithm.hpp   | 69 +++++++++++++++-
 vespalib/src/vespa/vespalib/fuzzy/table_dfa.h      |  3 +-
 vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp    | 17 ++--
 14 files changed, 379 insertions(+), 69 deletions(-)

(limited to 'vespalib')

diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index 69b34ece2c7..dfd1b3b0c3b 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -24,12 +24,13 @@ using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaT
 namespace {
 
 std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
-                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type,
+                                     LevenshteinDfa::Matching matching)
 {
-    auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
+    auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type, matching);
     auto maybe_match_lhs = dfa_lhs.match(right);
 
-    auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
+    auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type, matching);
     auto maybe_match_rhs = dfa_rhs.match(left);
 
     EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
@@ -40,6 +41,12 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig
     return std::nullopt;
 }
 
+std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
+                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+    return do_calculate(left, right, threshold, casing, dfa_type, LevenshteinDfa::Matching::FullString);
+}
+
 std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold,
                                      LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
 {
@@ -74,6 +81,16 @@ struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
         return do_calculate(left, right, threshold, casing(), dfa_type());
     }
 
+    static std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+        // Prefix matching is not symmetric, cannot use do_calculate() as it implicitly checks this.
+        auto dfa = LevenshteinDfa::build(left, threshold, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix);
+        auto maybe_match = dfa.match(right);
+        if (maybe_match.matches()) {
+            return {maybe_match.edits()};
+        }
+        return std::nullopt;
+    }
+
 };
 
 // All the baseline DFA tests use lowercase only, so they should have the exact same outcome
@@ -98,6 +115,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) {
         EXPECT_EQ(calculate("abc", "ab", max),   std::optional{1}) << max;
         EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max;
         EXPECT_EQ(calculate("a", "", max),       std::optional{1}) << max;
+        EXPECT_EQ(calculate("", "", max),        std::optional{0}) << max;
     }
     EXPECT_EQ(calculate("bc", "abcd", 2),  std::optional{2});
     EXPECT_EQ(calculate("ab", "abcd", 2),  std::optional{2});
@@ -109,6 +127,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) {
     EXPECT_EQ(calculate("ab", "", 2),      std::optional{2});
     EXPECT_EQ(calculate("abc", "", 2),     std::nullopt);
     EXPECT_EQ(calculate("abc", "123", 2),  std::nullopt);
+    EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt);
 }
 
 TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
@@ -124,6 +143,37 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
     EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt);
 }
 
+TEST_P(LevenshteinDfaTest, can_match_in_target_string_prefix_mode) {
+    for (auto max : {1, 2}) {
+        EXPECT_EQ(prefix_calculate("",    "literally anything", max), std::optional{0});
+        EXPECT_EQ(prefix_calculate("",    "", max),        std::optional{0});
+        EXPECT_EQ(prefix_calculate("x",   "", max),        std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abc", max),     std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcd", max),    std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcdef", max),  std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "ab", max),      std::optional{1});
+        EXPECT_EQ(prefix_calculate("ac",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acd", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("bc",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acb", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acdefg", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acb", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abd", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbc", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xy", max),      std::nullopt);
+    }
+    EXPECT_EQ(prefix_calculate("abc", "xxabc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("xy",  "", 2), std::optional{2});
+    EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
+}
+
 void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source,
                         std::string_view expected_successor, std::string_view successor_prefix)
 {
@@ -182,11 +232,22 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
                             "\xf0\x9f\xa4\xa9""food");    // <starry eyed emoji>food
 
     // Note that as a general rule, emojis are fickle beasts to deal with since a single
-    // emoji often takes up multiple code points, which we consider separate characters
-    // but a user sees as a single actual rendered glyph.
+    // emoji often takes up multiple code points. We consider these as separate characters
+    // but from a Unicode perspective they are considered part of an "extended grapheme
+    // cluster", from which the actually rendered glyph is derived.
     // Multi-code point character edit distance support is left as an exercise for the reader :D
 }
 
+TEST_P(LevenshteinDfaTest, can_generate_successors_with_prefix_match_mode) {
+    auto dfa = LevenshteinDfa::build("ban", 1, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix);
+    // There is no difference at all in successor output when using Prefix mode as opposed to
+    // FullString mode, but since we match _more_ source strings in prefix mode we cannot
+    // simply run the same test set for both modes. This just tests that the wiring seems fine.
+    test_dfa_successor(dfa, "",      "\x01""an");
+    test_dfa_successor(dfa, "xy",    "yan");
+    test_dfa_successor(dfa, "bolle", "bon");
+}
+
 TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
     auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
     // The successor string must be lexicographically larger than the input string.
@@ -332,13 +393,21 @@ std::string bits_to_str(T v) {
     return ret;
 }
 
-using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>;
+using CasingAndDfaTypeAndMaxEdits = std::tuple<
+    LevenshteinDfa::Casing,
+    LevenshteinDfa::DfaType,
+    LevenshteinDfa::Matching,
+    uint32_t
+>;
 
 struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> {
-    // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number.
+    // Print test suffix as e.g. "/Uncased_Explicit_FullString_1" instead of just a GTest-chosen number.
     static std::string stringify_params(const TestParamInfo<ParamType>& info) {
         std::ostringstream ss;
-        ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param);
+        ss << std::get<0>(info.param)
+           << '_' << std::get<1>(info.param)
+           << '_' << std::get<2>(info.param)
+           << '_' << std::get<3>(info.param);
         return ss.str();
     }
 };
@@ -350,6 +419,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
                                  Values(LevenshteinDfa::DfaType::Explicit,
                                         LevenshteinDfa::DfaType::Implicit,
                                         LevenshteinDfa::DfaType::Table),
+                                 Values(LevenshteinDfa::Matching::FullString,
+                                        LevenshteinDfa::Matching::Prefix),
                                  Values(1, 2)),
                          LevenshteinDfaSuccessorTest::stringify_params);
 
@@ -369,10 +440,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
  * Inspired by approach used by Lucene DFA exhaustive testing.
  */
 TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
-    const auto [casing, dfa_type, max_edits] = GetParam();
+    const auto [casing, dfa_type, matching, max_edits] = GetParam();
     for (uint32_t i = 0; i < 256; ++i) {
         const auto target = bits_to_str(static_cast<uint8_t>(i));
-        auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type);
+        auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type, matching);
         std::string skip_to, successor;
         for (uint32_t j = 0; j < 256; ++j) {
             const auto source = bits_to_str(static_cast<uint8_t>(j));
diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
index 37d1d4b4c54..918cbc624db 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
@@ -16,6 +16,13 @@ std::optional<uint32_t> calculate(std::string_view left, std::string_view right,
     return leftRight;
 }
 
+// Prefix matching is asymmetric and therefore cannot implicitly test result symmetry
+std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+    auto left_codepoints  = vespalib::LowerCase::convert_to_ucs4(left);
+    auto right_codepoints = vespalib::LowerCase::convert_to_ucs4(right);
+    return vespalib::LevenshteinDistance::calculate(left_codepoints, right_codepoints, threshold, true);
+}
+
 TEST(LevenshteinDistance, calculate_edgecases) {
     EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0});
     EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1});
@@ -33,6 +40,54 @@ TEST(LevenshteinDistance, calculate_edgecases) {
     EXPECT_EQ(calculate("ab", "", 2), std::optional{2});
     EXPECT_EQ(calculate("abc", "", 2), std::nullopt);
     EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+    EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt);
+}
+
+TEST(LevenshteinDistance, prefix_match_edge_cases) {
+    // Same cases as LevenshteinDfaTest (TODO consolidate these somehow)
+    for (auto max : {1, 2}) {
+        EXPECT_EQ(prefix_calculate("",    "literally anything", max), std::optional{0});
+        EXPECT_EQ(prefix_calculate("",    "", max),        std::optional{0});
+        EXPECT_EQ(prefix_calculate("x",   "", max),        std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abc", max),     std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcd", max),    std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "abcdef", max),  std::optional{0});
+        EXPECT_EQ(prefix_calculate("abc", "ab", max),      std::optional{1});
+        EXPECT_EQ(prefix_calculate("ac",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acd", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("bc",  "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acb", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "acdefg", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("acb", "abcdef", max),  std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abd", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbc", max),     std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1});
+        EXPECT_EQ(prefix_calculate("abc", "xy", max),      std::nullopt);
+    }
+    EXPECT_EQ(prefix_calculate("abc", "xxabc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abc", 2),   std::optional{2});
+    EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2),  std::optional{2});
+    EXPECT_EQ(prefix_calculate("xy",  "", 2), std::optional{2});
+    EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
+
+    // Max edits > 2 cases; not supported by DFA implementation.
+    EXPECT_EQ(prefix_calculate("abc", "", 3),      std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xy", 3),    std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xyz", 3),   std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xyzzz", 3), std::optional{3});
+    EXPECT_EQ(prefix_calculate("abcd", "xyzd", 3), std::optional{3});
+    EXPECT_EQ(prefix_calculate("abcd", "xyzz", 3), std::nullopt);
+    EXPECT_EQ(prefix_calculate("abcd", "", 3),     std::nullopt);
+}
+
+TEST(LevenshteinDistance, oversized_max_edits_is_well_defined) {
+    const auto k = uint32_t(INT32_MAX) + 10000u;
+    EXPECT_EQ(calculate("abc", "xyz", k), std::optional{3});
+    EXPECT_EQ(prefix_calculate("abc", "xyzzzz", k), std::optional{3});
 }
 
 GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
index c69f414aae6..8b2597dd179 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -20,6 +20,12 @@ concept DfaMatcher = requires(T a, std::string u8str, std::vector<uint32_t> u32s
     // matching to have the expected semantics, the actual target string must be pre-lowercased.
     { a.is_cased() } -> std::same_as<bool>;
 
+    // Whether the matching is performed in prefix mode. In prefix mode, a source string is
+    // considered a match if it matches the target string at any point during DFA traversal.
+    // I.e. the source string "bananas" prefix-matched against the target (prefix) "ban" will
+    // be returned as a match with 0 edits.
+    { a.is_prefix() } -> std::same_as<bool>;
+
     // Initial (starting) state of the DFA
     { a.start() } -> std::same_as<typename T::StateType>;
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 490582b5bf7..a3542d61dc8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -97,9 +97,11 @@ public:
 private:
     std::vector<DfaNodeType> _nodes;
     const bool               _is_cased;
+    const bool               _is_prefix;
 public:
-    explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept
-        : _is_cased(is_cased)
+    ExplicitLevenshteinDfaImpl(bool is_cased, bool is_prefix) noexcept
+        : _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
     ~ExplicitLevenshteinDfaImpl() override;
 
@@ -140,10 +142,12 @@ template <typename Traits>
 class ExplicitLevenshteinDfaBuilder {
     const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
     const bool                  _is_cased;
+    const bool                  _is_prefix;
 public:
-    ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased) noexcept
+    ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased, bool is_prefix) noexcept
         : _u32_str_buf(std::move(str)),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
 
     [[nodiscard]] LevenshteinDfa build_dfa() const;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 55dd459ff26..11a822d2d7e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -22,16 +22,20 @@ struct ExplicitDfaMatcher {
 
     const std::span<const DfaNodeType> _nodes;
     const bool                         _is_cased;
+    const bool                         _is_prefix;
 
-    ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased) noexcept
+    ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased, bool is_prefix) noexcept
         : _nodes(nodes),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
 
     static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
 
     bool is_cased() const noexcept { return _is_cased; }
 
+    bool is_prefix() const noexcept { return _is_prefix; }
+
     StateType start() const noexcept {
         return &_nodes[0];
     }
@@ -99,21 +103,21 @@ ExplicitLevenshteinDfaImpl<MaxEdits>::~ExplicitLevenshteinDfaImpl() = default;
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str);
 }
 
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string& successor_out) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
 }
 
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
 }
 
@@ -199,10 +203,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> {
     using Base::transitions;
 
     const bool _is_cased;
+    const bool _is_prefix;
 public:
-    ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased) noexcept
+    ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased, bool is_prefix) noexcept
         : DfaSteppingBase<Traits>(str),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {
         assert(str.size() < UINT32_MAX / max_out_edges_per_node());
     }
@@ -217,7 +223,7 @@ public:
 
 template <typename Traits>
 LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
-    auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased);
+    auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased, _is_prefix);
     ExploreState<StateType> exp;
     // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_
     // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth.
@@ -257,7 +263,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
 
 template <typename Traits>
 LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const {
-    ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased);
+    ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased, _is_prefix);
     return builder.build_dfa();
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 20c8e1b7e2b..552fd849734 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -13,14 +13,16 @@ class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
     std::string                 _target_as_utf8;
     std::vector<uint32_t>       _target_utf8_char_offsets;
     const bool                  _is_cased;
+    const bool                  _is_prefix;
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
 
-    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased)
+    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix)
         : _u32_str_buf(std::move(str)),
           _target_as_utf8(),
           _target_utf8_char_offsets(),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {
         precompute_utf8_target_with_offsets();
     }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index 05ef2761f34..7dadcc59b8e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -32,21 +32,26 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     std::span<const char>     _target_as_utf8;
     std::span<const uint32_t> _target_utf8_char_offsets;
     const bool                _is_cased;
+    const bool                _is_prefix;
 
     ImplicitDfaMatcher(std::span<const uint32_t> u32_str,
                        std::span<const char>     target_as_utf8,
                        std::span<const uint32_t> target_utf8_char_offsets,
-                       bool is_cased) noexcept
+                       bool is_cased,
+                       bool is_prefix) noexcept
         : Base(u32_str),
           _target_as_utf8(target_as_utf8),
           _target_utf8_char_offsets(target_utf8_char_offsets),
-          _is_cased(is_cased)
+          _is_cased(is_cased),
+          _is_prefix(is_prefix)
     {}
 
     // start, is_match, can_match, match_edit_distance are all provided by base type
 
     bool is_cased() const noexcept { return _is_cased; }
 
+    bool is_prefix() const noexcept { return _is_prefix; }
+
     template <typename F>
     bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
         for (uint32_t i = 0; i < state.size(); ++i) {
@@ -137,21 +142,21 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str);
 }
 
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string& successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 19c2fffbb3e..468619c8036 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -41,34 +41,40 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
     _impl->dump_as_graphviz(out);
 }
 
-LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits,
+                                     Casing casing, DfaType dfa_type, Matching matching) {
     if (max_edits != 1 && max_edits != 2) {
         throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
     }
-    const bool is_cased = (casing == Casing::Cased);
+    const bool is_cased    = (casing == Casing::Cased);
+    const bool is_prefix   = (matching == Matching::Prefix);
     auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string)
                                       : utf8_string_to_utf32_lowercased(target_string);
     if (dfa_type == DfaType::Implicit) {
         if (max_edits == 1) {
-            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased, is_prefix));
         } else { // max_edits == 2
-            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased, is_prefix));
         }
     } else if(dfa_type == DfaType::Explicit) {
         if (max_edits == 1) {
-            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
+            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased, is_prefix).build_dfa();
         } else { // max_edits == 2
-            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
+            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased, is_prefix).build_dfa();
         }
     } else { // DfaType::Table
         if (max_edits == 1) {
-            return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased, is_prefix));
         } else { // max_edits == 2
-            return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased));
+            return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased, is_prefix));
         }
     }
 }
 
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
+    return build(target_string, max_edits, casing, dfa_type, Matching::FullString);
+}
+
 LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
     // TODO automatically select implementation based on target length/max edits?
     //  Suggestion:
@@ -112,4 +118,12 @@ std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) {
     return os;
 }
 
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m) {
+    switch (m) {
+    case LevenshteinDfa::Matching::FullString: os << "FullString"; return os;
+    case LevenshteinDfa::Matching::Prefix:     os << "Prefix"; return os;
+    }
+    abort();
+}
+
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index 44c62bdb957..feace39b313 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -265,6 +265,33 @@ public:
         Cased
     };
 
+    enum class Matching {
+        /**
+         * Edit distance is computed based on the _entire_ source string. Matching is
+         * symmetric between source and target strings, i.e. match(x, y) and match(y, x)
+         * will yield the same result.
+         */
+        FullString,
+        /**
+         * Edit distance is computed based on a _prefix_ of the source string, as compared
+         * against the target string. Matching is therefore _asymmetric_ between source and
+         * target strings.
+         *
+         * Example of matching source strings against the target 'ban' (i.e. the prefix query)
+         * and 1 max edit distance:
+         *   'bananas'  - 0 edits (source prefix 'ban' exact-matches 'ban')
+         *   'balloons' - 1 edit ('bal' vs 'ban')
+         *   '2bananas' - 1 edit ('2ban' vs 'ban')
+         *   'boonanas' - mismatch (2 edits)
+         *
+         * Note that Prefix matching will match a lot more strings than FullString, so in
+         * practice it should be combined with prefix _locking_ to constrain the candidate
+         * result set to a more reasonable cardinality. In particular, max edits >= |target|
+         * will match _every_ source string trivially.
+         */
+        Prefix
+    };
+
     /**
      * Builds and returns a Levenshtein DFA that matches all strings within `max_edits`
      * edits of `target_string`. The type of DFA returned is specified by dfa_type.
@@ -273,6 +300,9 @@ public:
      *
      * `target_string` must not contain any null UTF-8 chars.
      */
+    [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits,
+                                              Casing casing, DfaType dfa_type, Matching matching);
+
     [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits,
                                               Casing casing, DfaType dfa_type);
 
@@ -301,5 +331,6 @@ public:
 std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos);
 std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt);
 std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c);
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m);
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
index 4658635d880..f5b8d0fc0b8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
@@ -3,31 +3,46 @@
 
 #include "levenshtein_distance.h"
 
+#include <cassert>
 #include <limits>
 #include <vector>
 
+namespace vespalib {
+
 std::optional<uint32_t>
-vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold)
+LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right,
+                               uint32_t threshold, bool prefix_match)
 {
+    assert(left.size() <= static_cast<size_t>(INT32_MAX));
+    assert(right.size() <= static_cast<size_t>(INT32_MAX));
+    threshold = std::min(threshold, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
     uint32_t n = left.size();
     uint32_t m = right.size();
 
-    if (n > m) {
-        return calculate(right, left, threshold);
-    }
-
-    // if one string is empty, the edit distance is necessarily the length
-    // of the other
-    if (n == 0) {
-        return m <= threshold ? std::optional(m) : std::nullopt;
-    }
-    if (m == 0) {
-        return n <= threshold ? std::optional(n) : std::nullopt;
-    }
-
-    // the edit distance cannot be less than the length difference
-    if (m - n > threshold) {
-        return std::nullopt;
+    if (!prefix_match) {
+        // These optimizations are only valid when matching with target/source string symmetry.
+        // Correctness of the main matrix calculation loop should not depend on these.
+        if (n > m) {
+            return calculate(right, left, threshold, false);
+        }
+        // if one string is empty, the edit distance is necessarily the length
+        // of the other.
+        if (n == 0) {
+            return m <= threshold ? std::optional(m) : std::nullopt;
+        }
+        if (m == 0) {
+            return n <= threshold ? std::optional(n) : std::nullopt;
+        }
+        // the edit distance cannot be less than the length difference
+        if (m - n > threshold) {
+            return std::nullopt;
+        }
+    } else {
+        // A source (right) cannot be transformed into a target prefix (left) if doing
+        // so would require inserting more than max edits number of characters.
+        if ((n > m) && (n - m > threshold)) {
+            return std::nullopt;
+        }
     }
 
     std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally
@@ -48,6 +63,7 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp
     }
 
     // iterates through t
+    uint32_t min_edits = n; // prefix matching: worst-case to transform to target
     for (uint32_t j = 1; j <= m; ++j) {
         uint32_t rightJ = right[j - 1]; // jth character of right
         d[0] = j;
@@ -56,9 +72,9 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp
 
         uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ?
                        n : std::min(n, j + threshold);
-
         // ignore entry left of leftmost
         if (min > 1) {
+            assert(static_cast<size_t>(min) <= d.size());
             d[min - 1] = std::numeric_limits<uint32_t>::max();
         }
 
@@ -76,12 +92,35 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp
             lowerBound = std::min(lowerBound, d[i]);
         }
         if (lowerBound > threshold) {
-            return std::nullopt;
+            if (!prefix_match) {
+                return std::nullopt; // Cannot match
+            } else {
+                break; // May already have matched via min_edits
+            }
         }
         std::swap(p, d);
+        // For prefix matching:
+        // By definition, the Levenshtein matrix cell at row `i`, column `j`
+        // provides the minimum number of edits required to transform a prefix of
+        // source string S (up to and including length `i`) into a prefix of target
+        // string T (up to and including length `j`). Since we want to match against
+        // the entire target (prefix query) string with length `n`, the problem is
+        // reduced to finding the minimum value of the `n`th column that is `<= k`
+        // (aggregated here and checked after the loop).
+        min_edits = std::min(p[n], min_edits);
     }
-    if (p[n] <= threshold) {
+    if (prefix_match) {
+        return ((min_edits <= threshold) ? std::optional<uint32_t>{min_edits} : std::nullopt);
+    } else if (p[n] <= threshold) {
         return {p[n]};
     }
     return std::nullopt;
 }
+
+std::optional<uint32_t>
+LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold)
+{
+    return calculate(left, right, threshold, false);
+}
+
+} // vespalib
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
index f105dcdaaf4..e284e071731 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
@@ -19,7 +19,15 @@ namespace vespalib {
  */
 class LevenshteinDistance {
 public:
-    static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold);
+    // Iff `prefix_match` == true, `right` is the candidate to match against prefix `left`
+    static std::optional<uint32_t> calculate(std::span<const uint32_t> left,
+                                             std::span<const uint32_t> right,
+                                             uint32_t threshold,
+                                             bool prefix_match);
+
+    static std::optional<uint32_t> calculate(std::span<const uint32_t> left,
+                                             std::span<const uint32_t> right,
+                                             uint32_t threshold);
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index 388099a0358..e8b1bb5b415 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -25,11 +25,14 @@ struct MatchAlgorithm {
     static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
 
     /**
-     * Matches UTF-8 source string `source` against the target DFA, optionally generating
+     * Matches UTF-8/32 source string `source` against the target DFA, optionally generating
      * the successor string iff the source string is not within the maximum number of edits
      * of the target string.
      *
-     * The actual match loop is very simple: we try to match the DFA as far as we can
+     * The actual match loop is very simple, but with some subtle differences in semantics
+     * depending on whether the "full string" or "prefix" matching mode is used.
+     *
+     * For the full string matching mode, we try to match the DFA as far as we can
      * before either consuming all input (source string) characters or ending up in a non-
      * matching state before we have consumed all input. In the former case, we may be in
      * a matching state (consider matching "foo" with the target string "food"; after
@@ -39,6 +42,15 @@ struct MatchAlgorithm {
      * If we end up in a matching state, all is well. We simply return a MatchResult with
      * the number of edits the state represents.
      *
+     * Prefix matching is very similar, but since we're interested in the number of edits
+     * required to transform a _prefix_ of the source into the target string, we cannot
+     * require matching the entire source string in the case that it is longer than the
+     * target (as that would not just be a prefix, after all). We consider a source prefix
+     * to be matched if _any_ DFA state is a match. This may be the initial state.
+     * Since the first match might not yield the minimum edit distance, we traverse the
+     * DFA until it no longer matches and return the smallest encountered distance as the
+     * final result.
+     *
      * The interesting bit happens the string does _not_ match and we are asked to provide a
      * _successor_ string that _does_ match and is strictly greater in lexicographic order.
      *
@@ -143,6 +155,18 @@ struct MatchAlgorithm {
      * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
      * ordering when compared byte-wise against an implicitly lowercased dictionary.
      *
+     * The successor generation algorithm generalizes without changes to work as expected
+     * when prefix matching is used. This is because the target string represents the
+     * prefix and the successor consequently represents the smallest possible greater prefix.
+     *
+     * Proof by contradiction: S is a source string that does _not_ prefix match target
+     * string T, and S' is the returned successor. Assume that discarding (skipping) all
+     * candidate source strings C with S < C < S' misses at least one such candidate C'
+     * that has a prefix matching T. This means that it must be possible to traverse the
+     * DFA with source string C' and end up in a matching state. But by definition the
+     * successor S' is the smallest possible lexicographically greater string that can
+     * possibly match the DFA for T; a contradiction.
+     *
      * TODO let matcher know if source string is pre-normalized (i.e. lowercased).
      */
     template <DfaMatcher Matcher, typename SuccessorT>
@@ -158,6 +182,9 @@ struct MatchAlgorithm {
 
         StateType state = matcher.start();
         while (u8_reader.hasMore()) {
+            if (matcher.is_prefix() && matcher.is_match(state)) {
+                return minimal_matching_prefix_distance(matcher, state, u8_reader);
+            }
             const auto pos_before_char = static_cast<uint32_t>(successor_out.size());
             const uint32_t raw_mch     = u8_reader.getChar();
             const uint32_t mch         = normalized_match_char(raw_mch, matcher.is_cased());
@@ -198,6 +225,9 @@ struct MatchAlgorithm {
         Utf8Reader u8_reader(source.data(), source.size());
         StateType state = matcher.start();
         while (u8_reader.hasMore()) {
+            if (matcher.is_prefix() && matcher.is_match(state)) {
+                return minimal_matching_prefix_distance(matcher, state, u8_reader);
+            }
             const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased());
             auto maybe_next    = matcher.match_input(state, mch);
             if (matcher.can_match(maybe_next)) {
@@ -213,6 +243,41 @@ struct MatchAlgorithm {
         return MatchResult::make_mismatch(max_edits());
     }
 
+    /**
+     * Follows the DFA (in an already matching state) until it no longer matches, keeping
+     * track of the minimum edit distance encountered. This is used to compute the minimum
+     * number of edits during prefix matching, as just returning the edit distance of the
+     * _first_ matching state may not result in the minimum.
+     *
+     * Example 1: matching the source string 'bananas' against the prefix target 'ban' with
+     * max edits 2 will be in a matching state already after consuming 'b' (can append 2
+     * characters after), but had we processed the remaining characters we would end up with
+     * the actual prefix edit distance of 0.
+     *
+     * Example 2: matching the source string 'abcdef' against target 'acd' with max edits 2
+     * will be in a matching state after 'a' (2 edits), but the minimal edit prefix is 'abcd'
+     * (1 edit). This demonstrates that we may have to process more than |target| number of
+     * source characters to get a minimal distance.
+     */
+    template <DfaMatcher Matcher>
+    static MatchResult minimal_matching_prefix_distance(const Matcher& matcher,
+                                                        typename Matcher::StateType state,
+                                                        Utf8Reader& u8_reader)
+    {
+        auto min_edits = matcher.match_edit_distance(state);
+        while (u8_reader.hasMore()) {
+            const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased());
+            auto maybe_next    = matcher.match_input(state, mch);
+            if (matcher.can_match(maybe_next)) {
+                state = maybe_next;
+                min_edits = std::min(min_edits, matcher.match_edit_distance(state));
+            } else {
+                break;
+            }
+        }
+        return MatchResult::make_match(max_edits(), min_edits);
+    }
+
     /**
      * Instantly backtrack to the last possible branching point in the DFA where we can
      * choose some higher outgoing edge character value and still match the DFA. If the node
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
index 4ad203c9a46..a15f321f32d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
@@ -46,12 +46,13 @@ public:
 private:
     const std::vector<Lookup> _lookup;
     const bool                _is_cased;
+    const bool                _is_prefix;
 
     static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str);
     
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
-    TableDfa(std::vector<uint32_t> str, bool is_cased);
+    TableDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix);
     ~TableDfa() override;
     [[nodiscard]] MatchResult match(std::string_view source) const override;
     [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
index e7c721f644e..721cfe296d2 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
@@ -353,11 +353,13 @@ struct TableMatcher {
     const TableDfa<N>::Lookup *lookup;
     const uint32_t             end;
     const bool                 cased;
+    const bool                 prefix;
 
-    TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in)
-      noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {}
+    TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in, bool prefix_in)
+      noexcept : lookup(lookup_in), end(end_in), cased(cased_in), prefix(prefix_in) {}
     
     bool is_cased() const noexcept { return cased; }
+    bool is_prefix() const noexcept { return prefix; }
     static constexpr S start() noexcept { return S::start(); }
 
     uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); }
@@ -473,9 +475,10 @@ TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup>
 }
 
 template <uint8_t N>
-TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased)
+TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix)
   : _lookup(make_lookup(str)),
-    _is_cased(is_cased)
+    _is_cased(is_cased),
+    _is_prefix(is_prefix)
 {
 }
 
@@ -486,7 +489,7 @@ template <uint8_t N>
 LevenshteinDfa::MatchResult
 TableDfa<N>::match(std::string_view u8str) const
 {
-    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix);
     return MatchAlgorithm<N>::match(matcher, u8str);
 }
 
@@ -494,7 +497,7 @@ template <uint8_t N>
 LevenshteinDfa::MatchResult
 TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const
 {
-    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix);
     return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
 }
 
@@ -502,7 +505,7 @@ template <uint8_t N>
 LevenshteinDfa::MatchResult
 TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const
 {
-    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+    TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix);
     return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
 }
 
-- 
cgit v1.2.3