author     Tor Brede Vekterli <vekterli@yahooinc.com>  2023-05-16 12:08:25 +0000
committer  Tor Brede Vekterli <vekterli@yahooinc.com>  2023-07-18 09:31:48 +0000
commit     f3229403240a180260acf9c2b9dd598513811ddb (patch)
tree       a00c920bd99b51d0b450da448e9185b43ff33e13 /vespalib/src
parent     1a7da42f40f7f7c664380fa646b048f8ee4e3523 (diff)
Implement Levenshtein DFAs with successor string generation
This implements code for building and evaluating Levenshtein Deterministic Finite Automata, where the resulting DFA efficiently matches all possible source strings that can be transformed to the target string within k max edits. This allows for O(n) matching of strings. We currently support k in {1, 2}.

Additionally, when matching using a DFA, in the case where the source string does _not_ match, we can generate the _successor_ string; the next matching string that is lexicographically _greater_ than the source string. This string has the invariant that there are no possibly matching strings within k edits ordered after the source string but before the successor. This lets us do possibly massive leaps forward in an ordered dictionary, turning a scan for matches into a sublinear operation.

Matching and successor generation are fully Unicode-aware. All input strings are expected to be in UTF-8 (without nulls), and the generated successor is also encoded as UTF-8. Internally, matching is done on UTF-32 code points and the DFA itself is built around UTF-32. UTF-8 decoding of source strings is done in a streaming fashion and does not require any allocations.

This commit includes a templated core Levenshtein DFA matching (and successor generation) algorithm and two separate DFA implementations that can be used; one explicit and one implicit.

The explicit DFA is an immutable DAG built up-front that represents all DFA states and transitions as explicit nodes and edges in a graph. This is currently the fastest to evaluate, but its build time and memory usage mean its use should be limited to shorter strings (up to a few hundred chars).

The implicit DFA does not build any graph up-front, but rather evaluates state transitions on-demand for any given source string. This is currently slower than the explicit DFA, but its O(1) memory usage (aside from the memory used by the target string itself) means that it can be used for arbitrary string lengths.

This code currently exists as a freestanding vespalib utility, and is not yet wired to any production code (fuzzy matching or similar).

Future optimizations:
* Redesign the sparse state representation and stepping logic to be much less branching, in turn making the code much less likely to stall the CPU pipeline.
* Emit as much as possible of the successor string suffix by copying directly from the target string's UTF-8 representation instead of following the DFA and encoding UTF-32 to UTF-8 chars.
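For context, a minimal usage sketch (not part of this commit) of how the successor strings enable skipping through a sorted dictionary. It uses only the public API exercised by the unit tests in this diff; the dictionary and the fuzzy_scan helper are made up for illustration:

#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical helper: collect all entries of a lexicographically sorted dictionary
// that are within max_edits edits of target, leaping ahead via successor strings.
std::vector<std::string> fuzzy_scan(const std::vector<std::string>& sorted_dict,
                                    const std::string& target, uint8_t max_edits) {
    using vespalib::fuzzy::LevenshteinDfa;
    auto dfa = LevenshteinDfa::build(target, max_edits, LevenshteinDfa::DfaType::Implicit);
    std::vector<std::string> hits;
    std::string successor;
    auto iter = sorted_dict.cbegin();
    while (iter != sorted_dict.cend()) {
        auto maybe_match = dfa.match(*iter, &successor);
        if (maybe_match.matches()) {
            hits.emplace_back(*iter);
            ++iter;
        } else {
            // No string ordered after *iter but before the successor can match,
            // so jump directly to the first dictionary entry >= successor.
            iter = std::lower_bound(iter, sorted_dict.cend(), successor);
        }
    }
    return hits;
}

This mirrors the skipping dictionary-scan benchmark in levenshtein_dfa_test.cpp below.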
Diffstat (limited to 'vespalib/src')
-rw-r--r--  vespalib/src/tests/fuzzy/CMakeLists.txt                            9
-rw-r--r--  vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp                507
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt                  10
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h                   70
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h            299
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg          286
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp    11
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h     147
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp   228
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp     9
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h      35
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp   121
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp             83
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h              244
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp            291
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/sparse_state.h                  175
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp               108
-rw-r--r--  vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h                  33
18 files changed, 2663 insertions, 3 deletions
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt
index bc48e775711..00a89d0a604 100644
--- a/vespalib/src/tests/fuzzy/CMakeLists.txt
+++ b/vespalib/src/tests/fuzzy/CMakeLists.txt
@@ -16,3 +16,12 @@ vespa_add_executable(vespalib_levenshtein_distance_test_app TEST
GTest::GTest
)
vespa_add_test(NAME vespalib_levenshtein_distance_test_app COMMAND vespalib_levenshtein_distance_test_app)
+
+vespa_add_executable(vespalib_levenshtein_dfa_test_app TEST
+ SOURCES
+ levenshtein_dfa_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+)
+vespa_add_test(NAME vespalib_levenshtein_dfa_test_app COMMAND vespalib_levenshtein_dfa_test_app)
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
new file mode 100644
index 00000000000..6966fd0b703
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -0,0 +1,507 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+#include <vespa/vespalib/fuzzy/dfa_stepping_base.h>
+#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/fuzzy/levenshtein_distance.h> // For benchmarking purposes
+#include <vespa/vespalib/util/benchmark_timer.h>
+#include <charconv>
+#include <concepts>
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <string_view>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using namespace vespalib::fuzzy;
+namespace fs = std::filesystem;
+
+static std::string benchmark_dictionary;
+
+struct LevenshteinDfaTest : TestWithParam<LevenshteinDfa::DfaType> {
+
+ static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+
+ static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+ auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type());
+ auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+
+ auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type());
+ auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+
+ EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
+ if (maybe_match_lhs.matches()) {
+ EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
+ return {maybe_match_lhs.edits()};
+ }
+ return std::nullopt;
+ }
+
+ static std::optional<uint32_t> calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) {
+ std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
+ std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
+ return calculate(lhs_ch, rhs_ch, threshold);
+ }
+
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+ LevenshteinDfaTest,
+ Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit),
+ PrintToStringParamName());
+
+// Same as existing non-DFA Levenshtein tests, but with some added instantiations
+// for smaller max distances.
+TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) {
+ EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0});
+ for (auto max : {1, 2}) {
+ EXPECT_EQ(calculate("abc", "ab1", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "1bc", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "a1c", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max;
+ }
+ EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+ EXPECT_EQ(calculate("ab", "", 1), std::nullopt);
+ EXPECT_EQ(calculate("ab", "", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "", 2), std::nullopt);
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+}
+
+TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
+ // Each hiragana/katakana/kanji corresponds to multiple (3) UTF-8 chars but a single UTF-32 code point.
+ EXPECT_EQ(calculate(u8"猫", u8"猫", 2), std::optional{0});
+ EXPECT_EQ(calculate(u8"猫", u8"犬", 2), std::optional{1});
+ EXPECT_EQ(calculate(u8"猫と犬", u8"犬と猫", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"猫は好き", u8"犬が好き", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カラオケ", 2), std::optional{0});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カラoケ", 2), std::optional{1});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カraオケ", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"kaラオケ", u8"カラオケ", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt);
+}
+
+void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) {
+ std::string successor;
+ auto m = dfa.match(source, &successor);
+ if (m.matches()) {
+ FAIL() << "Expected '" << source << "' to emit a successor, but it "
+ << "matched with " << static_cast<uint32_t>(m.edits())
+ << " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)";
+ }
+ EXPECT_EQ(successor, expected_successor);
+ EXPECT_TRUE(dfa.match(successor, nullptr).matches());
+}
+
+TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
+ auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+
+ test_dfa_successor(dfa, "", "\x01""food");
+ test_dfa_successor(dfa, "faa", "faod");
+ test_dfa_successor(dfa, "fooooo", "foop");
+ test_dfa_successor(dfa, "ooof", "pfood");
+ test_dfa_successor(dfa, "fo", "fo\x01""d");
+ test_dfa_successor(dfa, "oo", "ood");
+ test_dfa_successor(dfa, "ooo", "oood");
+ test_dfa_successor(dfa, "foh", "fohd");
+ test_dfa_successor(dfa, "foho", "fohod");
+ test_dfa_successor(dfa, "foxx", "foyd");
+ test_dfa_successor(dfa, "xfa", "xfood");
+ test_dfa_successor(dfa, "gg", "good");
+ test_dfa_successor(dfa, "gp", "hfood");
+ test_dfa_successor(dfa, "ep", "f\x01""od");
+ test_dfa_successor(dfa, "hfoodz", "hood");
+ test_dfa_successor(dfa, "aooodz", "bfood");
+
+ // Also works with Unicode
+ // 2 chars
+ test_dfa_successor(dfa, "\xc3\x86""x", // "Æx"
+ "\xc3\x87""food"); // "Çfood"
+ // 3 chars
+ test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は"
+ "\xe7\x8c\xac""food"); // "猬food"
+ // 4 chars
+ test_dfa_successor(dfa, "\xf0\x9f\xa4\xa9""abc", // <starry eyed emoji>abc
+ "\xf0\x9f\xa4\xa9""food"); // <starry eyed emoji>food
+
+ // Note that as a general rule, emojis are fickle beasts to deal with since a single
+ // emoji often takes up multiple code points, which we consider separate characters
+ // but a user sees as a single actual rendered glyph.
+ // Multi-code point character edit distance support is left as an exercise for the reader :D
+}
+
+TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
+ auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+ // The successor string must be lexicographically larger than the input string.
+ // In the presence of a wildcard output edge we handle this by increasing the input
+ // character by 1 and encoding it back as UTF-8.
+ // It is possible (though arguably very unlikely) that the input character is
+ // U+10FFFF, which is the maximum valid Unicode character. We have to ensure that
+ // we can encode U+10FFFF + 1, even though it's technically outside the valid range.
+ // Luckily, UTF-8 can technically (there's that word again) encode up to U+1FFFFF,
+ // so the resulting string is byte-wise greater, and that's what matters since we
+ // don't guarantee that the successor string is _valid_ UTF-8.
+ // This problem does not happen with the target string, as it's an invalid character
+ // and will be replaced with the Unicode replacement char before we ever see it.
+ test_dfa_successor(dfa, "\xf4\x8f\xbf\xbf""xyz", // U+10FFFF
+ "\xf4\x90\x80\x80""food");// U+10FFFF+1
+}
+
+TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) {
+ auto dfa = LevenshteinDfa::build("", 1, dfa_type());
+ test_dfa_successor(dfa, "aa", "b");
+ test_dfa_successor(dfa, "b\x01", "c");
+ test_dfa_successor(dfa, "vespa", "w");
+}
+
+// We should normally be able to rely on higher-level components to ensure we
+// only receive valid UTF-8, but make sure we don't choke if malformed input slips through.
+TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) {
+ // 0xff is not a valid encoding and is implicitly converted to U+FFFD,
+ // which is the standard Unicode replacement character.
+ EXPECT_EQ(calculate("\xff", "a", 2), std::optional{1});
+ EXPECT_EQ(calculate("\xff\xff", "a", 2), std::optional{2});
+ EXPECT_EQ(calculate("a", "\xff", 2), std::optional{1});
+ EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt);
+ EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
+}
+
+TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) {
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument);
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument);
+}
+
+// Turn integer v into its bitwise string representation with the MSB as the leftmost character.
+template <std::unsigned_integral T>
+std::string bits_to_str(T v) {
+ constexpr const uint8_t n_bits = sizeof(T) * 8;
+ std::string ret(n_bits, '0');
+ for (uint8_t bit = 0; bit < n_bits; ++bit) {
+ if (v & (T{1} << bit)) { // T-width shift; a plain `1` would be a 32-bit int
+ ret[n_bits - bit - 1] = '1';
+ }
+ }
+ return ret;
+}
+
+using DfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::DfaType, uint32_t>;
+
+struct LevenshteinDfaSuccessorTest : TestWithParam<DfaTypeAndMaxEdits> {
+ // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number.
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+ std::ostringstream ss;
+ ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+ return ss.str();
+ }
+};
+
+INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
+ LevenshteinDfaSuccessorTest,
+ Combine(Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit),
+ Values(1, 2)),
+ LevenshteinDfaSuccessorTest::stringify_params);
+
+/**
+ * Exhaustively test successor generation by matching all target and source strings
+ * in {0,1}^8 against each other. Since we generate bit strings identical to the
+ * bit patterns of the underlying counter(s), any string at index `i+1` will compare
+ * lexicographically greater than the one at `i`. We use this to test that we never
+ * miss a valid match that comes between a mismatch and its generated successor.
+ *
+ * For each mismatch we note the successor it emitted. Verify that each subsequent
+ * match() invocation for a source string < the successor results in a mismatch.
+ *
+ * We test this for both max edit distance 1 and 2. Despite being an exhaustive test,
+ * this completes in a few dozen milliseconds even with ASan instrumentation.
+ *
+ * Inspired by the approach used by Lucene's DFA exhaustive testing.
+ */
+TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
+ const auto [dfa_type, max_edits] = GetParam();
+ for (uint32_t i = 0; i < 256; ++i) {
+ const auto target = bits_to_str(static_cast<uint8_t>(i));
+ auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type);
+ std::string skip_to, successor;
+ for (uint32_t j = 0; j < 256; ++j) {
+ const auto source = bits_to_str(static_cast<uint8_t>(j));
+ auto maybe_match = target_dfa.match(source, &successor);
+ if (maybe_match.matches() && !skip_to.empty()) {
+ ASSERT_GE(source, skip_to);
+ } else if (!maybe_match.matches()) {
+ ASSERT_FALSE(successor.empty()) << source;
+ ASSERT_GE(successor, skip_to) << source;
+ ASSERT_GT(successor, source) << source;
+ skip_to = successor;
+ }
+ }
+ }
+}
+
+namespace {
+
+template <uint8_t MaxEdits>
+void explore(const DfaSteppingBase<FixedMaxEditDistanceTraits<MaxEdits>>& stepper,
+ const typename DfaSteppingBase<FixedMaxEditDistanceTraits<MaxEdits>>::StateType& in_state)
+{
+ ASSERT_EQ(stepper.can_match(stepper.step(in_state, WILDCARD)),
+ stepper.can_wildcard_step(in_state));
+ if (!stepper.can_match(in_state)) {
+ return; // reached the end of the line
+ }
+ // DFS-explore all matching transitions, as well as one non-matching transition
+ auto t = stepper.transitions(in_state);
+ for (uint32_t c: t.u32_chars()) {
+ ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.step(in_state, c)));
+ }
+ ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.step(in_state, WILDCARD)));
+}
+
+} // anon ns
+
+using StateStepperTypes = Types<
+ DfaSteppingBase<FixedMaxEditDistanceTraits<1>>,
+ DfaSteppingBase<FixedMaxEditDistanceTraits<2>>
+>;
+
+template <typename SteppingBase>
+struct LevenshteinSparseStateTest : Test {};
+
+TYPED_TEST_SUITE(LevenshteinSparseStateTest, StateStepperTypes);
+
+// "Meta-test" for checking that the `can_wildcard_step` predicate function is
+// functionally equivalent to evaluating `can_match(stepper.step(in_state, WILDCARD))`
+TYPED_TEST(LevenshteinSparseStateTest, wildcard_step_predicate_is_equivalent_to_step_with_can_match) {
+ for (const char* target : {"", "a", "ab", "abc", "abcdef", "aaaaa"}) {
+ auto u32_target = utf8_string_to_utf32(target);
+ TypeParam stepper(u32_target);
+ ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.start()));
+ }
+}
+
+template <typename T>
+void do_not_optimize_away(T&& t) noexcept {
+ asm volatile("" : : "m"(t) : "memory"); // Clobber the value to avoid losing it to compiler optimizations
+}
+
+enum class BenchmarkType {
+ DfaExplicit,
+ DfaImplicit,
+ Legacy
+};
+
+const char* to_s(BenchmarkType t) noexcept {
+ // Note: need underscores since this is used as part of GTest-generated test instance names
+ switch (t) {
+ case BenchmarkType::DfaExplicit: return "DFA_explicit";
+ case BenchmarkType::DfaImplicit: return "DFA_implicit";
+ case BenchmarkType::Legacy: return "legacy";
+ }
+ abort();
+}
+
+[[nodiscard]] bool benchmarking_enabled() noexcept {
+ return !benchmark_dictionary.empty();
+}
+
+[[nodiscard]] std::vector<uint32_t> string_lengths() {
+ return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64};
+}
+
+struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
+
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+ return to_s(info.param);
+ }
+
+ void SetUp() override {
+ if (!benchmarking_enabled()) {
+ GTEST_SKIP() << "benchmarking not enabled";
+ }
+ }
+
+ static BenchmarkType benchmark_type() noexcept { return GetParam(); }
+
+ static const std::vector<std::string>& load_dictionary_once() {
+ static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary));
+ return sorted_lines;
+ }
+
+ static std::vector<std::string> read_and_sort_all_lines(const fs::path& file_path) {
+ std::ifstream ifs(file_path);
+ if (!ifs.is_open()) {
+ throw std::invalid_argument("File does not exist");
+ }
+ std::vector<std::string> lines;
+ std::string line;
+ while (std::getline(ifs, line)) {
+ lines.emplace_back(line);
+ }
+ std::sort(lines.begin(), lines.end());
+ return lines;
+ }
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+ LevenshteinBenchmarkTest,
+ Values(BenchmarkType::DfaExplicit,
+ BenchmarkType::DfaImplicit,
+ BenchmarkType::Legacy),
+ LevenshteinBenchmarkTest::stringify_params);
+
+// ("abc", 1) => "a"
+// ("abc", 3) => "abc"
+// ("abc", 7) => "abcabca"
+// ... and so on.
+std::string repeated_string(std::string_view str, uint32_t sz) {
+ uint32_t chunks = sz / str.size();
+ std::string ret;
+ ret.reserve(sz);
+ for (uint32_t i = 0; i < chunks; ++i) {
+ ret += str;
+ }
+ uint32_t rem = sz % str.size();
+ ret += str.substr(0, rem);
+ return ret;
+}
+
+TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_time) {
+ using vespalib::BenchmarkTimer;
+ const auto type = benchmark_type();
+ fprintf(stderr, "------ %s ------\n", to_s(type));
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : string_lengths()) {
+ // Use same string as both source and target. This is the worst case in that the entire
+ // string must be matched and any sparse representation is always maximally filled since
+ // we never expend any edits via mismatches.
+ // Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.).
+ std::string str = repeated_string("abcde", sz);
+ double min_time_s;
+ if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
+ auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
+ : LevenshteinDfa::DfaType::Implicit;
+ auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ auto res = dfa.match(str, nullptr); // not benchmarking successor generation
+ do_not_optimize_away(res);
+ }, 1.0);
+ } else {
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ auto str_u32 = utf8_string_to_utf32(str); // Must be done per term, so included in benchmark body
+ auto res = vespalib::LevenshteinDistance::calculate(str_u32, str_u32, k);
+ do_not_optimize_away(res);
+ }, 1.0);
+ }
+ fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0);
+ }
+ }
+}
+
+TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) {
+ if (!benchmarking_enabled()) {
+ GTEST_SKIP() << "benchmarking not enabled";
+ }
+ using vespalib::BenchmarkTimer;
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : string_lengths()) {
+ std::string str = repeated_string("abcde", sz);
+ double min_time_s = BenchmarkTimer::benchmark([&] {
+ auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ do_not_optimize_away(dfa);
+ }, 2.0);
+ auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ size_t mem_usage = dfa.memory_usage();
+ fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage);
+ }
+ }
+}
+
+TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
+ using vespalib::BenchmarkTimer;
+ const auto type = benchmark_type();
+ const auto dict = load_dictionary_once();
+ std::vector target_lengths = {1, 2, 4, 8, 12, 16, 24, 32, 64};
+ fprintf(stderr, "------ %s ------\n", to_s(type));
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : target_lengths) {
+ std::string str = repeated_string("abcde", sz);
+ double min_time_s;
+ if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
+ auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
+ : LevenshteinDfa::DfaType::Implicit;
+ auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ for (const auto& line : dict) {
+ auto res = dfa.match(line, nullptr);
+ do_not_optimize_away(res);
+ }
+ }, 2.0);
+ } else {
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ auto target_u32 = utf8_string_to_utf32(str);
+ for (const auto& line : dict) {
+ auto line_u32 = utf8_string_to_utf32(line);
+ auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k);
+ do_not_optimize_away(res);
+ }
+ }, 2.0);
+ }
+ fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0);
+ }
+ }
+}
+
+TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
+ const auto type = benchmark_type();
+ if (type == BenchmarkType::Legacy) {
+ GTEST_SKIP() << "Skipping not supported for legacy implementation";
+ }
+ using vespalib::BenchmarkTimer;
+ const auto dict = load_dictionary_once();
+ std::vector target_lengths = {1, 2, 4, 8, 12, 16, 24, 32, 64};
+ fprintf(stderr, "------ %s ------\n", to_s(type));
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : target_lengths) {
+ std::string str = repeated_string("abcde", sz);
+ auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
+ : LevenshteinDfa::DfaType::Implicit;
+ auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ double min_time_s = BenchmarkTimer::benchmark([&] {
+ auto iter = dict.cbegin();
+ auto end = dict.cend();
+ std::string successor;
+ while (iter != end) {
+ auto maybe_match = dfa.match(*iter, &successor);
+ if (maybe_match.matches()) {
+ ++iter;
+ } else {
+ iter = std::lower_bound(iter, end, successor);
+ }
+ }
+ }, 2.0);
+ fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0);
+ }
+ }
+}
+
+// TODO:
+// - explicit successor generation benchmark
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ if (argc > 1) {
+ benchmark_dictionary = argv[1];
+ if (!fs::exists(fs::path(benchmark_dictionary))) {
+ fprintf(stderr, "Benchmark dictionary file '%s' does not exist\n", benchmark_dictionary.c_str());
+ return 1;
+ }
+ }
+ return RUN_ALL_TESTS();
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
index 1d770163e06..bdbb03bcfee 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -1,8 +1,12 @@
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(vespalib_vespalib_fuzzy OBJECT
- SOURCES
+ SOURCES
+ explicit_levenshtein_dfa.cpp
fuzzy_matcher.cpp
+ implicit_levenshtein_dfa.cpp
+ levenshtein_dfa.cpp
levenshtein_distance.cpp
- DEPENDS
- )
+ unicode_utils.cpp
+ DEPENDS
+)
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
new file mode 100644
index 00000000000..c445c60cc01
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -0,0 +1,70 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <concepts>
+#include <cstdint>
+
+namespace vespalib::fuzzy {
+
+// Concept that all DFA matcher implementations must satisfy
+template <typename T>
+concept DfaMatcher = requires(T a) {
+ typename T::StateType;
+ typename T::StateParamType;
+ typename T::EdgeType;
+
+ // Initial (starting) state of the DFA
+ { a.start() } -> std::same_as<typename T::StateType>;
+
+ // Whether a given state constitutes a string match within the maximum number of edits
+ { a.is_match(typename T::StateType{}) } -> std::same_as<bool>;
+
+ // Whether a given state _may_ result in a match, either in the given state or in the
+ // future if the remaining string input is within the max edit distance
+ { a.can_match(typename T::StateType{}) } -> std::same_as<bool>;
+
+ // Whether the given state is a valid state. Used for invariant checking.
+ { a.valid_state(typename T::StateType{}) } -> std::same_as<bool>;
+
+ // Iff the given state represents a terminal matching state, returns the number of
+ // edits required to reach the state. Otherwise, returns max edits + 1.
+ { a.match_edit_distance(typename T::StateType{}) } -> std::same_as<uint8_t>;
+
+ // Returns the state that is the result of matching the single logical Levenshtein
+ // matrix row represented by the given state with the input u32 character value.
+ { a.match_input(typename T::StateType{}, uint32_t{}) } -> std::same_as<typename T::StateType>;
+
+ // Returns the state that is the result of matching the single logical Levenshtein
+ // matrix row represented by the given state with a sentinel character that cannot
+ // match any character in the target string (i.e. is always a mismatch).
+ { a.match_wildcard(typename T::StateType{}) } -> std::same_as<typename T::StateType>;
+
+ // Whether there exists an out edge from the given state that can accept a
+ // _higher_ UTF-32 code point value (character) than the input u32 value. Such an
+ // edge _may_ be a wildcard edge, which accepts any character.
+ { a.has_higher_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>;
+
+ // Whether there exists an out edge from the given state whose u32 character value
+ // _exactly_ matches the input u32 value.
+ { a.has_exact_explicit_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>;
+
+ // Returns the out edge `e` from the given state that satisfies _both_:
+ // 1. higher than the given u32 value
+ // 2. no other out edge that also satisfies 1. is lower than `e`
+ // Only called in a context where the caller already knows that such an edge must exist.
+ { a.lowest_higher_explicit_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<typename T::EdgeType>;
+
+ // Returns the out edge from the given state that has the lowest character value
+ { a.smallest_explicit_out_edge(typename T::StateType{}) } -> std::same_as<typename T::EdgeType>;
+
+ // Whether the given edge is a valid edge. Used for invariant checking.
+ { a.valid_edge(typename T::EdgeType{}) } -> std::same_as<bool>;
+
+ // For a given edge, returns the UTF-32 code point value the edge represents
+ { a.edge_to_u32char(typename T::EdgeType{}) } -> std::same_as<uint32_t>;
+
+ // Returns the state that is the result of following the given edge from the given state.
+ { a.edge_to_state(typename T::StateType{}, typename T::EdgeType{}) } -> std::same_as<typename T::StateType>;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h
new file mode 100644
index 00000000000..7e7881c5a14
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h
@@ -0,0 +1,299 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "sparse_state.h"
+#include <span>
+
+namespace vespalib::fuzzy {
+
+template <typename Traits>
+struct DfaSteppingBase {
+ using StateType = Traits::StateType;
+ using TransitionsType = Traits::TransitionsType;
+
+ std::span<const uint32_t> _u32_str; // TODO std::u32string_view
+
+ DfaSteppingBase(std::span<const uint32_t> str) noexcept
+ : _u32_str(str)
+ {
+ }
+
+ [[nodiscard]] static constexpr uint8_t max_edits() noexcept {
+ return Traits::max_edits();
+ }
+
+ /**
+ * Returns the initial state of the DFA. This represents the first row in the
+ * Levenshtein matrix.
+ */
+ [[nodiscard]] StateType start() const {
+ StateType ret;
+ const auto j = std::min(static_cast<uint32_t>(max_edits()),
+ static_cast<uint32_t>(_u32_str.size())); // e.g. the empty string as target
+ for (uint32_t i = 0; i <= j; ++i) {
+ ret.append(i, i);
+ }
+ return ret;
+ }
+
+ /**
+ * DFA stepping function that takes an input (sparse) state and a 32-bit character value
+ * (does not have to be valid UTF-32, but usually is) and generates a resulting state
+ * that represents applying the Levenshtein algorithm on a particular matrix row using
+ * the provided source string character.
+ *
+ * The returned state only includes elements where the edit distance (cost) is within
+ * the maximum number of edits. All other elements are implicitly beyond the max
+ * edit distance. It doesn't matter _how_ far beyond they are, since we have a fixed
+ * maximum to consider.
+ *
+ * Stepping a non-matching state S (can_match(S) == false) results in another non-
+ * matching state.
+ *
+ * As an example, this is a visualization of stepping through all source characters of
+ * the string "fxod" when matching the target string "food" with max edits k=1.
+ * Note: the actual internal representation is logical <column#, cost> tuples, but
+ * rendering as a matrix makes things easier to understand. Elements _not_ part of the
+ * state are rendered as '-'.
+ *
+ *               f o o d
+ *   start(): [0 1 - - -]
+ *       'f': [1 0 1 - -]
+ *       'x': [- 1 1 - -]
+ *       'o': [- - 1 1 -]
+ *       'd': [- - - - 1]
+ *
+ * In this case, the resulting edit distance is 1, with one substitution 'x' -> 'o'.
+ *
+ * If we pull out our trusty pen & paper and do the full matrix calculations, we see
+ * that the above is equivalent to the full matrix with all costs > k pruned away:
+ *
+ *        f o o d
+ *     [0 1 2 3 4]
+ *   f [1 0 1 2 3]
+ *   x [2 1 1 2 3]
+ *   o [3 2 1 1 2]
+ *   d [4 3 2 2 1]
+ *
+ * Since we're working on sparse states, stepping requires a bit of manual edge case
+ * handling when compared to a dense representation.
+ *
+ * We first have to handle the case where our state includes the 0th matrix column.
+ * In an explicit Levenshtein matrix of target string length n, source string length m,
+ * the first column is always the values [0, m], increasing by 1 per row (the first
+ * _row_ is handled by start()).
+ *
+ * To mirror this, if our sparse state includes column 0 we have to increment it by 1,
+ * unless doing so would bring the cost beyond our max number of edits, in which case
+ * we don't bother including the column in the new state at all. These correspond to
+ * the start() -> 'f' -> 'x' transitions in the example above.
+ *
+ * What remains is then to do the actual Levenshtein insert/delete/substitute formula
+ * for matching positions in the matrix. Let d represent the logical (full) Levenshtein
+ * distance matrix, where cell d[i, j] is the minimum number of edits between the first
+ * i characters of the source string and the first j characters of the target string:
+ *
+ * Insertion cost: d[i, j-1] + 1
+ * Deletion cost: d[i-1, j] + 1
+ * Substitution cost: d[i-1, j-1] + (s[i-1] == t[j-1] ? 0 : 1)
+ *
+ * d[i, j] = min(Insertion cost, Deletion cost, Substitution cost)
+ *
+ * We have to turn this slightly on its head: instead of going through a matrix row
+ * and "pulling" values from the previous row, we have to go through a state representing
+ * the previous row and "push" new values instead (iff these values are within max edits).
+ * This also means we compute costs for indexes offset by 1 from the source state index
+ * (can be visualized as the element one down diagonally to the right).
+ *
+ * Insertion considers the current row only, i.e. the state being generated. We always
+ * work left to right in column order, so we can check if the last element (if any)
+ * in our _new_ sparse state is equal to the index of our source state element. If not,
+ * we know that it was beyond max edits. Max edits + 1 is inherently beyond max edits
+ * and need not be included.
+ *
+ * Deletion considers the cell directly above our own, which is part of the input state
+ * if it exists. Since we're computing the costs of cells at index + 1, we know that the
+ * only way for this cell to be present in the state is if the _next_ element of our
+ * input state exists and has an index equal to index + 1. If so, the deletion cost is
+ * the cost recorded for this element + 1.
+ *
+ * Substitution considers the cell diagonally up to the left. This very conveniently
+ * happens to be the input state cell we're currently working from, so it's therefore
+ * always present.
+ *
+ * Example stepping with c='x', max edits k=1:
+ *
+ * ====== Initially ======
+ *
+ * f o o d
+ * state_in: [1 0 1 - -] (0:1, 1:0, 2:1)
+ * out: [] ()
+ *
+ * We have a 0th column in state_in, but incrementing it results in 2 > k, so not
+ * appended to out.
+ *
+ * ====== State (0:1), computing for index 1 ======
+ *
+ * Insertion: out state is empty (no cell to our left), so implicit insertion cost
+ * is > k
+ * Deletion: state_in[1] is (1:0), which means it represents the cell just above
+ * index 1. Deletion cost is therefore 0+1 = 1
+ * Substitution: (t[0] = 'f') != (c = 'x'), so substitution cost is 1+1 = 2
+ *
+ * Min cost is 1, which is <= k. Appending to output.
+ *
+ * out: [- 1] (1:1)
+ *
+ * ====== State (1:0), computing for index 2 ======
+ *
+ * Insertion: last element in out has index 1 (cell to our immediate left) with cost
+ * 1, so insertion cost is 1+1 = 2
+ * Deletion: state_in[2] is (2:1), which means it represents the cell just above
+ * index 2. Deletion cost is therefore 1+1 = 2
+ * Substitution: (t[1] = 'o') != (c = 'x'), so substitution cost is 0+1 = 1
+ *
+ * Min cost is 1, which is <= k. Appending to output.
+ *
+ * out: [- 1 1] (1:1, 2:1)
+ *
+ * ====== State (2:1), computing for index 3 ======
+ *
+ * Insertion: last element in out has index 2 (cell to our immediate left) with cost
+ * 1, so insertion cost is 1+1 = 2
+ * Deletion: state_in[3] does not exist, so implicit deletion cost is > k
+ * Substitution: (t[2] = 'o') != (c = 'x'), so substitution cost is 1+1 = 2
+ *
+ * Min cost is 2, which is > k. Not appending to output.
+ *
+ * Resulting output state (right-padded for clarity):
+ *
+ * [- 1 1 - -] (1:1, 2:1)
+ *
+ */
+ [[nodiscard]] StateType step(const StateType& state_in, uint32_t c) const {
+ if (state_in.empty()) {
+ return state_in; // A non-matching state can only step to another equally non-matching state
+ }
+ StateType new_state;
+ if ((state_in.index(0) == 0) && (state_in.cost(0) < max_edits())) {
+ new_state.append(0, state_in.cost(0) + 1);
+ }
+ for (uint32_t i = 0; i < state_in.size(); ++i) {
+ const auto idx = state_in.index(i);
+ if (idx == _u32_str.size()) [[unlikely]] {
+ break; // Can't process beyond matrix width
+ }
+ const uint8_t sub_cost = (_u32_str[idx] == c) ? 0 : 1;
+ // For our Levenshtein insert/delete/sub ops, we know that if a particular index is _not_
+ // in the sparse state, its implicit distance is beyond the max edits, and need not be
+ // considered.
+ auto dist = state_in.cost(i) + sub_cost; // (Substitution)
+ if (!new_state.empty() && (new_state.last_index() == idx)) { // (Insertion) anything to our immediate left?
+ dist = std::min(dist, new_state.last_cost() + 1);
+ }
+ if ((i < state_in.size() - 1) && (state_in.index(i + 1) == idx + 1)) { // (Deletion) anything immediately above?
+ dist = std::min(dist, state_in.cost(i + 1) + 1);
+ }
+ if (dist <= max_edits()) {
+ new_state.append(idx + 1, dist);
+ }
+ }
+ return new_state;
+ }
+
+ /**
+ * Simplified version of step() which does not assemble a new state, but only checks
+ * whether _any_ mismatching character can be substituted in and still result in a
+ * potentially matching state. This is the case if the resulting state would contain
+ * _at least one_ entry (recalling that we only retain entries that are within the
+ * max number of edits).
+ *
+ * Consider using this directly instead of `can_match(step(state, WILDCARD))`,
+ * which has the exact same semantics, but requires computing the full (sparse)
+ * state before checking if it has any element at all. can_wildcard_step() just
+ * jumps straight to the last part.
+ */
+ [[nodiscard]] bool can_wildcard_step(const StateType& state_in) const noexcept {
+ if (state_in.empty()) {
+ return false; // by definition
+ }
+ if ((state_in.index(0) == 0) && (state_in.cost(0) < max_edits())) {
+ return true;
+ }
+ for (uint32_t i = 0; i < state_in.size(); ++i) {
+ const auto idx = state_in.index(i);
+ if (idx == _u32_str.size()) [[unlikely]] {
+ break;
+ }
+ const uint8_t sub_cost = 1; // by definition
+ auto dist = state_in.cost(i) + sub_cost;
+ // Insertion only looks at the entries already computed in the current row and
+ // always increases the cost by 1. Since we bail out immediately once any entry
+ // would be within max edits, the fact that we haven't already bailed out means
+ // that insertion cannot possibly yield a match here. So skip computing it entirely.
+ if ((i < state_in.size() - 1) && (state_in.index(i + 1) == idx + 1)) {
+ dist = std::min(dist, state_in.cost(i + 1) + 1);
+ }
+ if (dist <= max_edits()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Checks if the given state represents a terminal state within the max number of edits
+ */
+ [[nodiscard]] bool is_match(const StateType& state) const noexcept {
+ // If the last index is equal to the string's length, it means we were able to match
+ // the entire string and still be within the max edit distance.
+ return (!state.empty() && state.last_index() == static_cast<uint32_t>(_u32_str.size()));
+ }
+
+ /**
+ * Iff the input state represents a terminal matching state, returns the number of
+ * edits required to reach the state. Otherwise, returns max edits + 1.
+ */
+ [[nodiscard]] uint8_t match_edit_distance(const StateType& state) const noexcept {
+ if (!is_match(state)) {
+ return max_edits() + 1;
+ }
+ return state.last_cost();
+ }
+
+ /**
+ * Returns whether the given state _may_ end up matching the target string,
+ * depending on the remaining source string characters.
+ *
+ * Note: is_match(s) => can_match(s) is true, but
+ * can_match(s) => is_match(s) is false
+ */
+ [[nodiscard]] bool can_match(const StateType& state) const noexcept {
+ // The presence of any entries at all indicates that we may still potentially match
+ // the target string if the remaining input is within the maximum number of edits.
+ return !state.empty();
+ }
+
+ /**
+ * All valid character transitions from this state are those that are reachable
+ * within the max edit distance.
+ */
+ TransitionsType transitions(const StateType& state) const {
+ TransitionsType t;
+ for (size_t i = 0; i < state.size(); ++i) {
+ const auto idx = state.index(i);
+ if (idx < _u32_str.size()) [[likely]] {
+ t.add_char(_u32_str[idx]);
+ }
+ }
+ // We must ensure transitions are in increasing character order, so that the
+ // lowest possible higher char than any candidate char can be found with a
+ // simple "first-fit" linear scan.
+ t.sort();
+ return t;
+ }
+
+};
+
+}
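For reference, a small standalone sketch (not part of this commit) that walks the "fxod" vs "food" example from the step() documentation above through the stepping API. It assumes, as the test code earlier in this diff suggests, that utf8_string_to_utf32() returns an iterable sequence of UTF-32 code points:

#include <vespa/vespalib/fuzzy/dfa_stepping_base.h>
#include <vespa/vespalib/fuzzy/unicode_utils.h>
#include <cassert>
#include <cstdint>

void walk_through_fxod_example() {
    using namespace vespalib::fuzzy;
    auto target = utf8_string_to_utf32("food");                     // UTF-32 code points of the target
    DfaSteppingBase<FixedMaxEditDistanceTraits<1>> stepper(target); // k = 1 max edits
    auto state = stepper.start();                                   // sparse row [0 1 - - -]
    for (uint32_t c : utf8_string_to_utf32("fxod")) {               // step one source code point at a time
        state = stepper.step(state, c);
    }
    assert(stepper.is_match(state));                                // terminal state [- - - - 1]
    assert(stepper.match_edit_distance(state) == 1);                // one substitution: 'x' -> 'o'
}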
diff --git a/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg b/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg
new file mode 100644
index 00000000000..0974e1d161f
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg
@@ -0,0 +1,286 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><!-- Generated by graphviz version 2.40.1 (20161225.0304)
+ --><!-- Title: levenshtein_dfa Pages: 1 --><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="333pt" height="488pt" viewBox="0.00 0.00 333.00 488.00">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 484)">
+<title>levenshtein_dfa</title>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-484 329,-484 329,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="#000000" cx="211" cy="-462" rx="18" ry="18"/>
+<text text-anchor="middle" x="211" y="-457.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">0</text>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="#000000" cx="157" cy="-373.2" rx="18" ry="18"/>
+<text text-anchor="middle" x="157" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">1</text>
+</g>
+<!-- 0&#45;&gt;1 -->
+<g id="edge1" class="edge">
+<title>0-&gt;1</title>
+<path fill="none" stroke="#000000" d="M201.5939,-446.5322C193.3592,-432.9906 181.2571,-413.0894 171.7374,-397.4348"/>
+<polygon fill="#000000" stroke="#000000" points="174.6658,-395.5142 166.4795,-388.7885 168.6849,-399.1513 174.6658,-395.5142"/>
+<text text-anchor="middle" x="189.9453" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="#000000" cx="211" cy="-373.2" rx="18" ry="18"/>
+<text text-anchor="middle" x="211" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">2</text>
+</g>
+<!-- 0&#45;&gt;2 -->
+<g id="edge2" class="edge">
+<title>0-&gt;2</title>
+<path fill="none" stroke="#000000" d="M211,-443.6006C211,-431.4949 211,-415.4076 211,-401.6674"/>
+<polygon fill="#000000" stroke="#000000" points="214.5001,-401.272 211,-391.272 207.5001,-401.2721 214.5001,-401.272"/>
+<text text-anchor="middle" x="214.8913" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="#000000" cx="265" cy="-373.2" rx="18" ry="18"/>
+<text text-anchor="middle" x="265" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">3</text>
+</g>
+<!-- 0&#45;&gt;3 -->
+<g id="edge3" class="edge">
+<title>0-&gt;3</title>
+<path fill="none" stroke="#000000" d="M220.4061,-446.5322C228.6408,-432.9906 240.7429,-413.0894 250.2626,-397.4348"/>
+<polygon fill="#000000" stroke="#000000" points="253.3151,-399.1513 255.5205,-388.7885 247.3342,-395.5142 253.3151,-399.1513"/>
+<text text-anchor="middle" x="244.7223" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="#000000" cx="157" cy="-284.4" rx="18" ry="18"/>
+<text text-anchor="middle" x="157" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">4</text>
+</g>
+<!-- 1&#45;&gt;4 -->
+<g id="edge4" class="edge">
+<title>1-&gt;4</title>
+<path fill="none" stroke="#000000" d="M157,-354.8006C157,-342.6949 157,-326.6076 157,-312.8674"/>
+<polygon fill="#000000" stroke="#000000" points="160.5001,-312.472 157,-302.472 153.5001,-312.4721 160.5001,-312.472"/>
+<text text-anchor="middle" x="158.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 1&#45;&gt;4 -->
+<g id="edge6" class="edge">
+<title>1-&gt;4</title>
+<path fill="none" stroke="#000000" d="M156.1231,-354.8902C155.8898,-349.221 155.6708,-342.9535 155.5554,-337.2 155.4057,-329.7348 155.4057,-327.8652 155.5554,-320.4 155.6041,-317.9728 155.6712,-315.454 155.7501,-312.9273"/>
+<polygon fill="#000000" stroke="#000000" points="159.2558,-312.8309 156.1231,-302.7098 152.2605,-312.5755 159.2558,-312.8309"/>
+<text text-anchor="middle" x="158.7223" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="#000000" cx="103" cy="-284.4" rx="18" ry="18"/>
+<text text-anchor="middle" x="103" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">5</text>
+</g>
+<!-- 1&#45;&gt;5 -->
+<g id="edge5" class="edge">
+<title>1-&gt;5</title>
+<path fill="none" stroke="#000000" d="M147.5939,-357.7322C139.3592,-344.1906 127.2571,-324.2894 117.7374,-308.6348"/>
+<polygon fill="#000000" stroke="#000000" points="120.6658,-306.7142 112.4795,-299.9885 114.6849,-310.3513 120.6658,-306.7142"/>
+<text text-anchor="middle" x="138.8913" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="#000000" cx="307" cy="-284.4" rx="18" ry="18"/>
+<text text-anchor="middle" x="307" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">6</text>
+</g>
+<!-- 2&#45;&gt;6 -->
+<g id="edge7" class="edge">
+<title>2-&gt;6</title>
+<path fill="none" stroke="#000000" d="M224.3484,-360.8527C240.3968,-346.0079 267.511,-320.9274 286.2858,-303.5607"/>
+<polygon fill="#000000" stroke="#000000" points="288.8006,-306.0022 293.765,-296.6424 284.0473,-300.8635 288.8006,-306.0022"/>
+<text text-anchor="middle" x="268.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="#000000" cx="190" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="190" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">7</text>
+</g>
+<!-- 2&#45;&gt;7 -->
+<g id="edge8" class="edge">
+<title>2-&gt;7</title>
+<path fill="none" stroke="#000000" d="M208.8708,-355.1934C205.2075,-324.212 197.6865,-260.606 193.3267,-223.7346"/>
+<polygon fill="#000000" stroke="#000000" points="196.7849,-223.1741 192.1348,-213.6543 189.8333,-223.9961 196.7849,-223.1741"/>
+<text text-anchor="middle" x="205.8913" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 3&#45;&gt;6 -->
+<g id="edge9" class="edge">
+<title>3-&gt;6</title>
+<path fill="none" stroke="#000000" d="M272.7034,-356.9127C278.881,-343.8515 287.6665,-325.2765 294.7989,-310.1966"/>
+<polygon fill="#000000" stroke="#000000" points="298.0914,-311.4211 299.2032,-300.8848 291.7635,-308.4281 298.0914,-311.4211"/>
+<text text-anchor="middle" x="290.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 8 -->
+<g id="node9" class="node">
+<title>8</title>
+<ellipse fill="none" stroke="#000000" cx="263" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="263" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">8</text>
+</g>
+<!-- 3&#45;&gt;8 -->
+<g id="edge10" class="edge">
+<title>3-&gt;8</title>
+<path fill="none" stroke="#000000" d="M264.7972,-355.1934C264.4483,-324.212 263.732,-260.606 263.3168,-223.7346"/>
+<polygon fill="#000000" stroke="#000000" points="266.8158,-223.6142 263.2033,-213.6543 259.8162,-223.6931 266.8158,-223.6142"/>
+<text text-anchor="middle" x="267.8913" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 4&#45;&gt;7 -->
+<g id="edge11" class="edge">
+<title>4-&gt;7</title>
+<path fill="none" stroke="#000000" d="M163.3627,-267.2785C168.1106,-254.5023 174.6869,-236.8062 180.1171,-222.194"/>
+<polygon fill="#000000" stroke="#000000" points="183.4541,-223.2619 183.6568,-212.669 176.8925,-220.8234 183.4541,-223.2619"/>
+<text text-anchor="middle" x="180.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 5&#45;&gt;7 -->
+<g id="edge14" class="edge">
+<title>5-&gt;7</title>
+<path fill="none" stroke="#000000" d="M115.8371,-271.2973C130.1833,-256.6543 153.5826,-232.7709 170.2601,-215.7483"/>
+<polygon fill="#000000" stroke="#000000" points="172.8948,-218.0603 177.393,-208.4678 167.8946,-213.1615 172.8948,-218.0603"/>
+<text text-anchor="middle" x="157.7223" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 9 -->
+<g id="node10" class="node">
+<title>9</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="60" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="60" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">9(1)</text>
+</g>
+<!-- 5&#45;&gt;9 -->
+<g id="edge12" class="edge">
+<title>5-&gt;9</title>
+<path fill="none" stroke="#000000" d="M95.1131,-268.1127C88.7885,-255.0515 79.7938,-236.4765 72.4916,-221.3966"/>
+<polygon fill="#000000" stroke="#000000" points="75.4909,-219.5597 67.9825,-212.0848 69.1907,-222.6105 75.4909,-219.5597"/>
+<text text-anchor="middle" x="89.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 10 -->
+<g id="node11" class="node">
+<title>10</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="126" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="126" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">10(1)</text>
+</g>
+<!-- 5&#45;&gt;10 -->
+<g id="edge13" class="edge">
+<title>5-&gt;10</title>
+<path fill="none" stroke="#000000" d="M107.5441,-266.856C110.7887,-254.3287 115.2168,-237.2326 118.9222,-222.9264"/>
+<polygon fill="#000000" stroke="#000000" points="122.3462,-223.6657 121.4654,-213.1076 115.5698,-221.9105 122.3462,-223.6657"/>
+<text text-anchor="middle" x="120.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 6&#45;&gt;8 -->
+<g id="edge15" class="edge">
+<title>6-&gt;8</title>
+<path fill="none" stroke="#000000" d="M298.9297,-268.1127C292.4172,-254.9694 283.1382,-236.2426 275.6413,-221.1125"/>
+<polygon fill="#000000" stroke="#000000" points="278.5951,-219.1904 271.0191,-211.784 272.3229,-222.2984 278.5951,-219.1904"/>
+<text text-anchor="middle" x="291.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 11 -->
+<g id="node12" class="node">
+<title>11</title>
+<ellipse fill="none" stroke="#000000" cx="170" cy="-106.8" rx="18" ry="18"/>
+<text text-anchor="middle" x="170" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">11</text>
+</g>
+<!-- 7&#45;&gt;11 -->
+<g id="edge16" class="edge">
+<title>7-&gt;11</title>
+<path fill="none" stroke="#000000" d="M185.9527,-177.63C183.1694,-165.2722 179.4189,-148.6197 176.2537,-134.5662"/>
+<polygon fill="#000000" stroke="#000000" points="179.5848,-133.4268 173.973,-124.4402 172.7558,-134.9649 179.5848,-133.4268"/>
+<text text-anchor="middle" x="184.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 12 -->
+<g id="node13" class="node">
+<title>12</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="113" cy="-18" rx="18" ry="18"/>
+<text text-anchor="middle" x="113" y="-13.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">12(1)</text>
+</g>
+<!-- 7&#45;&gt;12 -->
+<g id="edge17" class="edge">
+<title>7-&gt;12</title>
+<path fill="none" stroke="#000000" d="M195.7305,-178.1814C201.8942,-156.377 209.2729,-118.3097 197,-88.8 185.9488,-62.2279 158.7426,-42.3831 138.2647,-30.5667"/>
+<polygon fill="#000000" stroke="#000000" points="139.8619,-27.4509 129.4116,-25.7072 136.4936,-33.5872 139.8619,-27.4509"/>
+<text text-anchor="middle" x="206.8913" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 8&#45;&gt;11 -->
+<g id="edge18" class="edge">
+<title>8-&gt;11</title>
+<path fill="none" stroke="#000000" d="M249.6754,-182.8771C234.3441,-168.2382 208.9734,-144.0133 190.9836,-126.8359"/>
+<polygon fill="#000000" stroke="#000000" points="192.9495,-123.8738 183.2999,-119.4993 188.1154,-128.9366 192.9495,-123.8738"/>
+<text text-anchor="middle" x="227.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 13 -->
+<g id="node14" class="node">
+<title>13</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="18" cy="-106.8" rx="18" ry="18"/>
+<text text-anchor="middle" x="18" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">13(1)</text>
+</g>
+<!-- 9&#45;&gt;13 -->
+<g id="edge19" class="edge">
+<title>9-&gt;13</title>
+<path fill="none" stroke="#000000" d="M44.2356,-186.804C34.7839,-180.5699 23.5878,-171.2526 18.2174,-159.6 14.7103,-151.9903 13.6799,-143.0834 13.8048,-134.7757"/>
+<polygon fill="#000000" stroke="#000000" points="17.3023,-134.9283 14.4812,-124.716 10.3181,-134.4586 17.3023,-134.9283"/>
+<text text-anchor="middle" x="22.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 9&#45;&gt;13 -->
+<g id="edge21" class="edge">
+<title>9-&gt;13</title>
+<path fill="none" stroke="#000000" d="M52.2966,-179.3127C46.119,-166.2515 37.3335,-147.6765 30.2011,-132.5966"/>
+<polygon fill="#000000" stroke="#000000" points="33.2365,-130.8281 25.7968,-123.2848 26.9086,-133.8211 33.2365,-130.8281"/>
+<text text-anchor="middle" x="45.7223" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 14 -->
+<g id="node15" class="node">
+<title>14</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="72" cy="-106.8" rx="18" ry="18"/>
+<text text-anchor="middle" x="72" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">14(0)</text>
+</g>
+<!-- 9&#45;&gt;14 -->
+<g id="edge20" class="edge">
+<title>9-&gt;14</title>
+<path fill="none" stroke="#000000" d="M62.4284,-177.63C64.0874,-165.353 66.3193,-148.8372 68.2105,-134.8421"/>
+<polygon fill="#000000" stroke="#000000" points="71.7044,-135.1219 69.5752,-124.7433 64.7675,-134.1845 71.7044,-135.1219"/>
+<text text-anchor="middle" x="71.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 10&#45;&gt;11 -->
+<g id="edge22" class="edge">
+<title>10-&gt;11</title>
+<path fill="none" stroke="#000000" d="M134.0703,-179.3127C140.5828,-166.1694 149.8618,-147.4426 157.3587,-132.3125"/>
+<polygon fill="#000000" stroke="#000000" points="160.6771,-133.4984 161.9809,-122.984 154.4049,-130.3904 160.6771,-133.4984"/>
+<text text-anchor="middle" x="155.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 10&#45;&gt;12 -->
+<g id="edge23" class="edge">
+<title>10-&gt;12</title>
+<path fill="none" stroke="#000000" d="M124.6819,-177.5934C122.4142,-146.612 117.7583,-83.006 115.0594,-46.1346"/>
+<polygon fill="#000000" stroke="#000000" points="118.5423,-45.772 114.3215,-36.0543 111.561,-46.2831 118.5423,-45.772"/>
+<text text-anchor="middle" x="124.8913" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 11&#45;&gt;12 -->
+<g id="edge24" class="edge">
+<title>11-&gt;12</title>
+<path fill="none" stroke="#000000" d="M161.5849,-90.7772C155.8346,-80.1448 147.859,-65.9928 140,-54 137.0775,-49.5402 133.7942,-44.8997 130.5502,-40.4917"/>
+<polygon fill="#000000" stroke="#000000" points="133.2767,-38.294 124.4657,-32.4105 127.6846,-42.5045 133.2767,-38.294"/>
+<text text-anchor="middle" x="153.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 13&#45;&gt;12 -->
+<g id="edge25" class="edge">
+<title>13-&gt;12</title>
+<path fill="none" stroke="#000000" d="M29.0516,-92.0851C37.7254,-81.0086 50.4346,-65.7741 63.2174,-54 71.136,-46.7062 80.5495,-39.5675 89.0481,-33.5954"/>
+<polygon fill="#000000" stroke="#000000" points="91.192,-36.3695 97.4722,-27.8367 87.2416,-30.5907 91.192,-36.3695"/>
+<text text-anchor="middle" x="67.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 14&#45;&gt;12 -->
+<g id="edge26" class="edge">
+<title>14-&gt;12</title>
+<path fill="none" stroke="#000000" d="M79.7118,-90.0974C85.7248,-77.0741 94.1818,-58.7574 101.0669,-43.8455"/>
+<polygon fill="#000000" stroke="#000000" points="104.3085,-45.1739 105.3228,-34.6277 97.9532,-42.2395 104.3085,-45.1739"/>
+<text text-anchor="middle" x="100.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 14&#45;&gt;12 -->
+<g id="edge27" class="edge">
+<title>14-&gt;12</title>
+<path fill="none" stroke="#000000" d="M71.9481,-88.4225C72.5604,-77.8966 74.4493,-64.6966 79.5554,-54 82.4978,-47.8362 86.8709,-42.0067 91.4921,-36.9052"/>
+<polygon fill="#000000" stroke="#000000" points="94.2341,-39.1107 98.7881,-29.5445 89.2625,-34.1828 94.2341,-39.1107"/>
+<text text-anchor="middle" x="82.7223" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+</g>
+</svg> \ No newline at end of file
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp
new file mode 100644
index 00000000000..f78de5cc082
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp
@@ -0,0 +1,11 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "explicit_levenshtein_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class ExplicitLevenshteinDfaImpl<1>;
+template class ExplicitLevenshteinDfaImpl<2>;
+template class ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>;
+template class ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>;
+
+}
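
The .hpp holds the template definitions and this .cpp is the only translation unit that instantiates them for the supported k values (1 and 2). A minimal sketch of that explicit-instantiation split, using hypothetical names rather than the classes above:

    // matcher.hpp: template definition, included only where instantiation happens
    template <int K>
    struct Matcher {
        static constexpr int max_edits() { return K; }
    };

    // matcher.cpp: the one translation unit that emits code for the supported K values
    // #include "matcher.hpp"
    template struct Matcher<1>;
    template struct Matcher<2>;
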
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
new file mode 100644
index 00000000000..49baad21530
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -0,0 +1,147 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "dfa_stepping_base.h"
+#include "levenshtein_dfa.h"
+#include "sparse_state.h"
+#include "unicode_utils.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+// A doomed state is one that cannot possibly match the target string
+constexpr const uint32_t DOOMED = UINT32_MAX;
+
+template <uint8_t MaxEdits>
+struct DfaNode {
+ static constexpr uint8_t MaxCharOutEdges = diag(MaxEdits); // Not counting wildcard edge
+
+ struct Edge {
+ uint32_t u32ch;
+ uint32_t node;
+ };
+
+ std::array<Edge, MaxCharOutEdges> match_out_edges_buf;
+ uint32_t wildcard_edge_to = DOOMED;
+ uint8_t num_match_out_edges = 0;
+ uint8_t edits = UINT8_MAX;
+
+ [[nodiscard]] bool has_wildcard_edge() const noexcept {
+ return wildcard_edge_to != DOOMED;
+ }
+
+ [[nodiscard]] uint32_t wildcard_edge_to_or_doomed() const noexcept {
+ return wildcard_edge_to;
+ }
+
+ [[nodiscard]] std::span<const Edge> match_out_edges() const noexcept {
+ return std::span(match_out_edges_buf.begin(), num_match_out_edges);
+ }
+
+ [[nodiscard]] uint32_t match_or_doomed(uint32_t ch) const noexcept {
+ // Always prefer the exact matching edges
+ for (const auto& e : match_out_edges()) {
+ if (e.u32ch == ch) {
+ return e.node;
+ }
+ }
+ // Fallback to wildcard edge if possible (could be doomed)
+ return wildcard_edge_to;
+ }
+
+ [[nodiscard]] bool has_exact_match(uint32_t ch) const noexcept {
+ for (const auto& e : match_out_edges()) {
+ if (e.u32ch == ch) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+    [[nodiscard]] bool has_higher_out_edge(uint32_t ch) const noexcept {
+ if (has_wildcard_edge()) {
+ return true; // implicitly possible to substitute a higher out edge char
+ }
+ return lowest_higher_explicit_out_edge(ch) != nullptr;
+ }
+
+ [[nodiscard]] const Edge* lowest_higher_explicit_out_edge(uint32_t ch) const noexcept {
+ // Important: these _must_ be sorted in increasing code point order
+ for (const auto& e : match_out_edges()) {
+ if (e.u32ch > ch) {
+ return &e;
+ }
+ }
+ return nullptr;
+ }
+
+ void add_match_out_edge(uint32_t out_char, uint32_t out_node) noexcept {
+ assert(num_match_out_edges < MaxCharOutEdges);
+ match_out_edges_buf[num_match_out_edges] = Edge(out_char, out_node);
+ ++num_match_out_edges;
+ }
+
+ void set_wildcard_out_edge(uint32_t out_node) noexcept {
+ assert(wildcard_edge_to == DOOMED);
+ wildcard_edge_to = out_node;
+ }
+};
+
+template <uint8_t MaxEdits>
+class ExplicitLevenshteinDfaImpl final : public LevenshteinDfa::Impl {
+public:
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+
+ using DfaNodeType = DfaNode<MaxEdits>;
+ using MatchResult = LevenshteinDfa::MatchResult;
+private:
+ std::vector<DfaNodeType> _nodes;
+public:
+ ExplicitLevenshteinDfaImpl() noexcept = default;
+ ~ExplicitLevenshteinDfaImpl() override = default;
+
+ static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+
+ void ensure_node_array_large_enough_for_index(uint32_t node_index) {
+ if (node_index >= _nodes.size()) {
+ _nodes.resize(node_index + 1);
+ }
+ }
+
+ void set_node_edit_distance(uint32_t node_index, uint8_t edits) {
+ _nodes[node_index].edits = edits;
+ }
+
+ void add_outgoing_edge(uint32_t from_node_idx, uint32_t to_node_idx, uint32_t out_char) {
+ _nodes[from_node_idx].add_match_out_edge(out_char, to_node_idx);
+ }
+
+ void set_wildcard_edge(uint32_t from_node_idx, uint32_t to_node_idx) {
+ _nodes[from_node_idx].set_wildcard_out_edge(to_node_idx);
+ }
+
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+
+ [[nodiscard]] size_t memory_usage() const noexcept override {
+ return sizeof(DfaNodeType) * _nodes.size();
+ }
+
+ void dump_as_graphviz(std::ostream& os) const override;
+};
+
+template <typename Traits>
+class ExplicitLevenshteinDfaBuilder {
+ std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+public:
+ explicit ExplicitLevenshteinDfaBuilder(std::string_view str)
+ : ExplicitLevenshteinDfaBuilder(utf8_string_to_utf32(str))
+ {}
+
+ explicit ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str) noexcept
+ : _u32_str_buf(std::move(str))
+ {}
+
+ [[nodiscard]] LevenshteinDfa build_dfa() const;
+};
+
+}
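
DfaNode resolves an input character by scanning its (sorted) explicit match edges first and only then falling back to the wildcard edge, which may itself be DOOMED. A self-contained sketch of that lookup order on a toy node type (illustrative names, not the types above):

    #include <cstdint>
    #include <vector>

    constexpr uint32_t DOOMED_SENTINEL = UINT32_MAX; // mirrors DOOMED above

    struct ToyNode {
        struct Edge { uint32_t ch; uint32_t to; };
        std::vector<Edge> match_edges;          // kept sorted by code point, as in DfaNode
        uint32_t wildcard_to = DOOMED_SENTINEL; // "any other char" edge, if present

        // Exact-match edges win over the wildcard edge; DOOMED means "can never match".
        uint32_t match_or_doomed(uint32_t ch) const {
            for (const auto& e : match_edges) {
                if (e.ch == ch) {
                    return e.to;
                }
            }
            return wildcard_to;
        }
    };

    int main() {
        ToyNode n;
        n.match_edges = {{'d', 3}, {'o', 4}};   // sorted by code point
        n.wildcard_to = 7;                      // insertions/substitutions land here
        return (n.match_or_doomed('o') == 4 && n.match_or_doomed('x') == 7) ? 0 : 1;
    }
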
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
new file mode 100644
index 00000000000..0960219aff3
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -0,0 +1,228 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "explicit_levenshtein_dfa.h"
+#include "match_algorithm.hpp"
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
+#include <iostream>
+#include <span>
+#include <queue>
+
+namespace vespalib::fuzzy {
+
+// DfaMatcher adapter for explicit DFA implementation
+template <uint8_t MaxEdits>
+struct ExplicitDfaMatcher {
+ using DfaNodeType = typename ExplicitLevenshteinDfaImpl<MaxEdits>::DfaNodeType;
+ using StateType = const DfaNodeType*;
+ using EdgeType = const DfaNodeType::Edge*;
+
+ using StateParamType = const DfaNodeType*;
+
+ const std::span<const DfaNodeType> _nodes;
+
+ explicit ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes) noexcept
+ : _nodes(nodes)
+ {}
+
+ static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+
+ StateType start() const noexcept {
+ return &_nodes[0];
+ }
+ bool has_higher_out_edge(StateType node, uint32_t mch) const noexcept {
+ return node->has_higher_out_edge(mch);
+ }
+ StateType match_input(StateType node, uint32_t mch) const noexcept {
+ auto maybe_node_idx = node->match_or_doomed(mch);
+ return ((maybe_node_idx != DOOMED) ? &_nodes[maybe_node_idx] : nullptr);
+ }
+ bool is_match(StateType node) const noexcept {
+ return node->edits <= max_edits();
+ }
+ bool can_match(StateType node) const noexcept {
+ return node != nullptr;
+ }
+ uint8_t match_edit_distance(StateType node) const noexcept {
+ return node->edits;
+ }
+ bool valid_state(StateType node) const noexcept {
+ return node != nullptr;
+ }
+ StateType match_wildcard(StateType node) const noexcept {
+ auto edge_to = node->wildcard_edge_to_or_doomed();
+ return ((edge_to != DOOMED) ? &_nodes[edge_to] : nullptr);
+ }
+ bool has_exact_explicit_out_edge(StateType node, uint32_t ch) const noexcept {
+ return node->has_exact_match(ch);
+ }
+ EdgeType lowest_higher_explicit_out_edge(StateType node, uint32_t ch) const noexcept {
+ return node->lowest_higher_explicit_out_edge(ch);
+ }
+ EdgeType smallest_explicit_out_edge(StateType node) const noexcept {
+ // Out-edges are pre-ordered in increasing code point order, so the first
+ // element is always the smallest possible matching character.
+ assert(!node->match_out_edges().empty());
+ return &node->match_out_edges().front();
+ }
+ bool valid_edge(EdgeType edge) const noexcept {
+ return edge != nullptr;
+ }
+ uint32_t edge_to_u32char(EdgeType edge) const noexcept {
+ return edge->u32ch;
+ }
+ StateType edge_to_state([[maybe_unused]] StateType node, EdgeType edge) const noexcept {
+ return &_nodes[edge->node];
+ }
+};
+
+template <uint8_t MaxEdits>
+LevenshteinDfa::MatchResult
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
+ ExplicitDfaMatcher<MaxEdits> matcher(_nodes);
+ return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t MaxEdits>
+void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) const {
+ os << std::dec << "digraph levenshtein_dfa {\n";
+ os << " fontname=\"Helvetica,Arial,sans-serif\"\n";
+ os << " node [shape=circle, fontname=\"Helvetica,Arial,sans-serif\", fixedsize=true];\n";
+ os << " edge [fontname=\"Helvetica,Arial,sans-serif\"];\n";
+ for (size_t i = 0; i < _nodes.size(); ++i) {
+ const auto& node = _nodes[i];
+ if (node.edits <= max_edits()) {
+ os << " " << i << " [label=\"" << i << "(" << static_cast<int>(node.edits) << ")\", style=\"filled\"];\n";
+ }
+ for (const auto& edge : node.match_out_edges()) {
+ std::string as_utf8;
+ append_utf32_char_as_utf8(as_utf8, edge.u32ch);
+ os << " " << i << " -> " << edge.node << " [label=\"" << as_utf8 << "\"];\n";
+ }
+ if (node.wildcard_edge_to != DOOMED) {
+ os << " " << i << " -> " << node.wildcard_edge_to << " [label=\"*\"];\n";
+ }
+ }
+ os << "}\n";
+}
+
+namespace {
+
+template <typename StateType>
+struct ExploreState {
+ using NodeIdAndExplored = std::pair<uint32_t, bool>;
+ using SparseExploredStates = vespalib::hash_map<StateType, NodeIdAndExplored, typename StateType::hash>;
+
+ uint32_t state_counter;
+ SparseExploredStates explored_states;
+
+ ExploreState();
+ ~ExploreState();
+
+ [[nodiscard]] SparseExploredStates::iterator node_of(const StateType& state) {
+ auto maybe_explored = explored_states.find(state);
+ if (maybe_explored != explored_states.end()) {
+ return maybe_explored;
+ }
+ uint32_t this_node = state_counter;
+ assert(state_counter < UINT32_MAX);
+ ++state_counter;
+ return explored_states.insert(std::make_pair(state, std::make_pair(this_node, false))).first; // not yet explored;
+ }
+
+ [[nodiscard]] bool already_explored(const SparseExploredStates::iterator& node) const noexcept {
+ return node->second.second;
+ }
+
+ void tag_as_explored(SparseExploredStates::iterator& node) noexcept {
+ node->second.second = true;
+ }
+};
+
+template <typename StateType>
+ExploreState<StateType>::ExploreState()
+ : state_counter(0),
+ explored_states()
+{}
+
+template <typename StateType>
+ExploreState<StateType>::~ExploreState() = default;
+
+template <typename Traits>
+class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> {
+ using Base = DfaSteppingBase<Traits>;
+
+ using StateType = typename Base::StateType;
+ using TransitionsType = typename Base::TransitionsType;
+
+ using Base::_u32_str;
+ using Base::max_edits;
+ using Base::start;
+ using Base::match_edit_distance;
+ using Base::step;
+ using Base::is_match;
+ using Base::can_match;
+ using Base::transitions;
+public:
+ explicit ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str) noexcept
+ : DfaSteppingBase<Traits>(str)
+ {
+ assert(str.size() < UINT32_MAX / max_out_edges_per_node());
+ }
+
+ [[nodiscard]] static constexpr uint8_t max_out_edges_per_node() noexcept {
+ // Max possible out transition characters (2k+1) + one wildcard edge.
+ return diag(max_edits()) + 1;
+ }
+
+ [[nodiscard]] LevenshteinDfa build_dfa() const;
+};
+
+template <typename Traits>
+LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
+ auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>();
+ ExploreState<StateType> exp;
+ // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_
+ // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth.
+ // This does _not_ always hold, such as if you have A->B and A->C->B (i.e. both parent and
+ // grandparent have a transition to the same state), in which case B may be allocated before C.
+ std::queue<StateType> to_explore;
+ to_explore.push(start());
+ while (!to_explore.empty()) {
+ auto state = std::move(to_explore.front());
+ to_explore.pop();
+ auto this_node = exp.node_of(state); // note: invalidated by subsequent calls to node_of
+ if (exp.already_explored(this_node)) {
+ continue;
+ }
+ exp.tag_as_explored(this_node);
+ const auto this_node_idx = this_node->second.first;
+ dfa->ensure_node_array_large_enough_for_index(this_node_idx);
+ dfa->set_node_edit_distance(this_node_idx, match_edit_distance(state));
+ auto t = transitions(state);
+ for (uint32_t out_c : t.u32_chars()) {
+ auto new_state = step(state, out_c);
+ auto out_node = exp.node_of(new_state);
+ dfa->add_outgoing_edge(this_node_idx, out_node->second.first, out_c);
+ to_explore.push(std::move(new_state));
+ }
+ auto wildcard_state = step(state, WILDCARD);
+ if (can_match(wildcard_state)) {
+ auto out_node = exp.node_of(wildcard_state);
+ dfa->set_wildcard_edge(this_node_idx, out_node->second.first);
+ to_explore.push(std::move(wildcard_state));
+ } // else: don't bother
+ }
+ return LevenshteinDfa(std::move(dfa));
+}
+
+} // anon ns
+
+template <typename Traits>
+LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const {
+ ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf);
+ return builder.build_dfa();
+}
+
+}
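
build_dfa() above explores the sparse-state space breadth-first, handing out node indices in discovery order and de-duplicating states through a hash map. A generic sketch of the same pattern on an ordinary string-keyed graph (hypothetical names, not the builder itself):

    #include <cstdint>
    #include <queue>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Assign BFS-order ids to all states reachable from `start`, visiting each exactly once.
    std::unordered_map<std::string, uint32_t>
    explore_bfs(const std::string& start,
                const std::unordered_map<std::string, std::vector<std::string>>& edges) {
        std::unordered_map<std::string, uint32_t> id_of; // doubles as the "already explored" set
        std::queue<std::string> to_explore;
        id_of.emplace(start, 0);
        to_explore.push(start);
        while (!to_explore.empty()) {
            auto state = std::move(to_explore.front());
            to_explore.pop();
            auto it = edges.find(state);
            if (it == edges.end()) {
                continue;
            }
            for (const auto& next : it->second) {
                // Like ExploreState::node_of(): reuse an existing id, otherwise hand out the next one.
                if (id_of.emplace(next, static_cast<uint32_t>(id_of.size())).second) {
                    to_explore.push(next);
                }
            }
        }
        return id_of; // most edges end up pointing from lower to higher ids, as noted above
    }
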
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
new file mode 100644
index 00000000000..8b9d2eddcac
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
@@ -0,0 +1,9 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "implicit_levenshtein_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>;
+template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>;
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
new file mode 100644
index 00000000000..0846b95d135
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -0,0 +1,35 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "levenshtein_dfa.h"
+#include "unicode_utils.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+template <typename Traits>
+class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
+ std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+public:
+ using MatchResult = LevenshteinDfa::MatchResult;
+
+ explicit ImplicitLevenshteinDfa(std::string_view str)
+ : ImplicitLevenshteinDfa(utf8_string_to_utf32(str))
+ {}
+
+ explicit ImplicitLevenshteinDfa(std::vector<uint32_t> str) noexcept
+ : _u32_str_buf(std::move(str))
+ {}
+
+ ~ImplicitLevenshteinDfa() override = default;
+
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+
+ [[nodiscard]] size_t memory_usage() const noexcept override {
+ return _u32_str_buf.size() * sizeof(uint32_t);
+ }
+
+ void dump_as_graphviz(std::ostream& os) const override;
+};
+
+}
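
memory_usage() above is just the UTF-32 copy of the target string; evaluation itself needs only O(1) extra space. A back-of-envelope sketch of that baseline, assuming one uint32_t per code point as in the buffer above:

    #include <cstddef>
    #include <cstdint>

    // Baseline bytes held by the implicit DFA for a target of n_code_points characters.
    constexpr size_t implicit_dfa_baseline_bytes(size_t n_code_points) {
        return n_code_points * sizeof(uint32_t);
    }

    static_assert(implicit_dfa_baseline_bytes(4) == 16);      // e.g. a 4 code point target like "food"
    static_assert(implicit_dfa_baseline_bytes(1000) == 4000);
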
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
new file mode 100644
index 00000000000..4ee468e424b
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -0,0 +1,121 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "dfa_stepping_base.h"
+#include "implicit_levenshtein_dfa.h"
+#include "match_algorithm.hpp"
+#include "sparse_state.h"
+#include <cassert>
+#include <stdexcept>
+
+namespace vespalib::fuzzy {
+
+// DfaMatcher adapter for implicit DFA implementation
+template <typename Traits>
+struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
+ using Base = DfaSteppingBase<Traits>;
+
+ using StateType = typename Base::StateType;
+ using EdgeType = uint32_t; // Just the raw u32 character value
+
+ using StateParamType = const StateType&;
+
+ using Base::_u32_str;
+ using Base::max_edits;
+ using Base::start;
+ using Base::match_edit_distance;
+ using Base::step;
+ using Base::can_wildcard_step;
+ using Base::is_match;
+ using Base::can_match;
+
+ explicit ImplicitDfaMatcher(std::span<const uint32_t> u32_str) noexcept
+ : Base(u32_str)
+ {}
+
+ // start, is_match, can_match, match_edit_distance are all provided by base type
+
+ template <typename F>
+ bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
+ for (uint32_t i = 0; i < state.size(); ++i) {
+ const auto idx = state.index(i);
+ if ((idx < _u32_str.size()) && f(_u32_str[idx])) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ template <typename F>
+ void for_each_char(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
+ for (uint32_t i = 0; i < state.size(); ++i) {
+ const auto idx = state.index(i);
+ if ((idx < _u32_str.size())) [[likely]] {
+ f(_u32_str[idx]);
+ }
+ }
+ }
+
+ bool has_explicit_higher_out_edge(const StateType& state, uint32_t ch) const noexcept {
+ return has_any_char_matching(state, [ch](uint32_t state_ch) noexcept {
+ return state_ch > ch;
+ });
+ }
+
+ bool has_higher_out_edge(const StateType& state, uint32_t mch) const noexcept {
+ return (has_explicit_higher_out_edge(state, mch) || can_wildcard_step(state));
+ }
+ StateType match_input(const StateType& state, uint32_t mch) const noexcept {
+ return step(state, mch);
+ }
+ bool valid_state(const StateType& state) const noexcept {
+ return !state.empty();
+ }
+ StateType match_wildcard(const StateType& state) const noexcept {
+ return step(state, WILDCARD);
+ }
+ bool has_exact_explicit_out_edge(const StateType& state, uint32_t ch) const noexcept {
+ return has_any_char_matching(state, [ch](uint32_t state_ch) noexcept {
+ return state_ch == ch;
+ });
+ }
+ EdgeType lowest_higher_explicit_out_edge(const StateType& state, uint32_t ch) const noexcept {
+ uint32_t min_ch = UINT32_MAX;
+ for_each_char(state, [ch, &min_ch](uint32_t state_ch) noexcept {
+ if ((state_ch > ch) && (state_ch < min_ch)) {
+ min_ch = state_ch;
+ }
+ });
+ return min_ch;
+ }
+ EdgeType smallest_explicit_out_edge(const StateType& state) const noexcept {
+ uint32_t min_ch = UINT32_MAX;
+ for_each_char(state, [&min_ch](uint32_t state_ch) noexcept {
+ min_ch = std::min(min_ch, state_ch);
+ });
+ return min_ch;
+ }
+ bool valid_edge(EdgeType edge) const noexcept {
+ return edge != UINT32_MAX;
+ }
+ uint32_t edge_to_u32char(EdgeType edge) const noexcept {
+ return edge;
+ }
+ StateType edge_to_state(const StateType& state, EdgeType edge) const noexcept {
+ return step(state, edge);
+ }
+};
+
+template <typename Traits>
+LevenshteinDfa::MatchResult
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
+ ImplicitDfaMatcher<Traits> matcher(_u32_str_buf);
+ return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
+}
+
+template <typename Traits>
+void ImplicitLevenshteinDfa<Traits>::dump_as_graphviz(std::ostream&) const {
+ throw std::runtime_error("Graphviz output not available for implicit Levenshtein DFA");
+}
+
+}
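
The implicit matcher derives its "edges" directly from the characters the current sparse state can still consume, so lowest_higher_explicit_out_edge() is a plain minimum over characters strictly greater than ch, with UINT32_MAX doubling as "no such edge". A standalone sketch of that scan (illustrative, not the matcher itself):

    #include <cstdint>
    #include <span>

    // Smallest character in `chars` strictly greater than `ch`; UINT32_MAX if none exists.
    uint32_t lowest_higher(std::span<const uint32_t> chars, uint32_t ch) {
        uint32_t min_ch = UINT32_MAX;
        for (uint32_t c : chars) {
            if (c > ch && c < min_ch) {
                min_ch = c;
            }
        }
        return min_ch;
    }

    int main() {
        const uint32_t chars[] = {'d', 'o'};              // chars reachable from some state
        return (lowest_higher(chars, 'f') == 'o' &&       // 'o' is the lowest edge above 'f'
                lowest_higher(chars, 'p') == UINT32_MAX)  // nothing higher -> invalid edge
               ? 0 : 1;
    }
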
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
new file mode 100644
index 00000000000..e75ef8365bf
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -0,0 +1,83 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "explicit_levenshtein_dfa.h"
+#include "implicit_levenshtein_dfa.h"
+#include "levenshtein_dfa.h"
+#include <vespa/vespalib/util/stringfmt.h>
+#include <memory>
+
+namespace vespalib::fuzzy {
+
+LevenshteinDfa::LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept
+ : _impl(std::move(impl))
+{}
+
+LevenshteinDfa::LevenshteinDfa(LevenshteinDfa&&) noexcept = default;
+LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default;
+
+LevenshteinDfa::~LevenshteinDfa() = default;
+
+LevenshteinDfa::MatchResult
+LevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const {
+ return _impl->match(u8str, successor_out);
+}
+
+size_t LevenshteinDfa::memory_usage() const noexcept {
+ return _impl->memory_usage();
+}
+
+void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
+ _impl->dump_as_graphviz(out);
+}
+
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, DfaType dfa_type) {
+ if (max_edits != 1 && max_edits != 2) {
+ throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
+ }
+ if (dfa_type == DfaType::Implicit) {
+ if (max_edits == 1) {
+ return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(target_string));
+ } else { // max_edits == 2
+ return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(target_string));
+ }
+ } else { // DfaType::Explicit
+ if (max_edits == 1) {
+ return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(target_string).build_dfa();
+ } else { // max_edits == 2
+ return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(target_string).build_dfa();
+ }
+ }
+
+}
+
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits) {
+ // TODO automatically select implementation based on target length/max edits?
+ // Suggestion:
+ // - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64).
+ // - Implicit DFA otherwise.
+ // This keeps memory overhead < 64k and DFA construction time < 300 usec (measured on
+ // an M1 Pro; your mileage may vary etc).
+ // Ideally the implicit DFA would always be the fastest (or at least approximately as
+ // fast as the explicit DFA), but this is not yet the case.
+ return build(target_string, max_edits, DfaType::Implicit);
+}
+
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) {
+ if (mos.matches()) {
+ os << "match(" << static_cast<int>(mos.edits()) << " edits)";
+ } else {
+ os << "mismatch";
+ }
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) {
+ if (dt == LevenshteinDfa::DfaType::Implicit) {
+ os << "Implicit";
+ } else {
+ assert(dt == LevenshteinDfa::DfaType::Explicit);
+ os << "Explicit";
+ }
+ return os;
+}
+
+}
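
A minimal usage sketch of the build() and match() entry points defined above, assuming the vespa/vespalib/fuzzy headers are on the include path; the output comments follow the operator<< and matching semantics in this file:

    #include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
    #include <iostream>
    #include <string>

    int main() {
        using vespalib::fuzzy::LevenshteinDfa;
        auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::DfaType::Explicit);

        std::string successor;
        auto hit = dfa.match("fod", nullptr);        // "fod" is within 1 edit of "food"
        std::cout << hit << '\n';                    // prints "match(1 edits)"

        auto miss = dfa.match("foxx", &successor);   // needs 2 edits, so k=1 mismatches
        std::cout << miss << " successor=" << successor << '\n';
        return 0;
    }
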
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
new file mode 100644
index 00000000000..a26ccbe87ee
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -0,0 +1,244 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace vespalib::fuzzy {
+
+/**
+ * Levenshtein Deterministic Finite Automata (DFA)
+ *
+ * The Levenshtein distance (or edit distance) is the minimum number of edits (additions,
+ * deletions or substitutions) needed to transform a particular source string s to a
+ * particular target string t.
+ *
+ * Let m be the length of the source string and n be the length of the target string.
+ *
+ * The classic dynamic programming algorithm uses a n x m cost matrix and is therefore
+ * O(nm) in space and time. By observing that only 2 rows of the matrix are actually
+ * needed, this is commonly reduced to O(n) space complexity (still O(nm) time complexity).
+ * When the maximum number of allowed edits is constrained to k, some clever observations
+ * about the nature of the cost matrix allows for reducing the time complexity down to
+ * O(kn) (more specifically, O((2k+1) * n)). When k is fixed (e.g. k in {1, 2}), the
+ * time complexity simplifies down to O(n).
+ *
+ * This implements code for building and evaluating Levenshtein Deterministic Finite
+ * Automata, where the resulting DFA efficiently matches all possible source strings that
+ * can be transformed to the target string within k max edits. This allows for easy linear
+ * matching of strings.
+ *
+ * Inspiration:
+ * - http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
+ * - https://julesjacobs.com/2015/06/17/disqus-levenshtein-simple-and-fast.html
+ *
+ * The latter in particular was a close inspiration for the sparse DFA state management.
+ *
+ * ====== Dictionary skipping via successor string generation ======
+ *
+ * Scanning for edit distance matches frequently takes place against a sorted dictionary.
+ * When matching using a DFA, in the case where the source string does _not_ match, we can
+ * generate the _successor_ string; the next matching string that is lexicographically
+ * _greater_ than the source string. This string has the invariant that there are no
+ * possibly matching strings within k edits ordered after the source string but before
+ * the successor.
+ *
+ * This lets us do possibly massive leaps forward in the dictionary, turning a dictionary
+ * scan into a sublinear operation.
+ *
+ * Note that the implemented successor algorithm is slightly different from that described
+ * in the above blog post. The implemented algorithm requires no extra data structures
+ * beyond the DFA itself and the target string, and tries to be extra clever with reducing
+ * the number of code point conversions required.
+ *
+ * ====== Unicode support ======
+ *
+ * Matching and successor generation is fully Unicode-aware. All input strings are expected
+ * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats;
+ * see the documentation for match()).
+ *
+ * Internally, matching is done on UTF-32 code points and the DFA itself is built around
+ * UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA.
+ *
+ * ====== Memory usage ======
+ *
+ * There is always a baseline DFA memory usage that is O(n) in the length of the target string, as the
+ * underlying DFA needs to convert the input UTF-8 string to explicit UTF-32 chars.
+ *
+ * Aside from the baseline, memory usage depends on whether an explicit or implicit DFA
+ * is used.
+ *
+ * ------ Explicit DFA ------
+ *
+ * The explicit DFA graph takes up quite a bit more memory than the original string
+ * representation (one reason is the use of UTF-32 characters under the hood).
+ *
+ * Expected upper bound memory usage for a string of length n with max edits k is
+ *
+ * (2k+1) * N(k) * n * W(k)
+ *
+ * where N(1) is expected to be 32 and N(2) is 48, W(1) is 1.34 and W(2) is 3.2 (empirically
+ * derived).
+ *
+ * Memory usage during building is higher due to keeping track of the set of generated
+ * states in a hash table, but still linear in input size. This extra memory is freed
+ * once building is complete.
+ *
+ * ------ Implicit DFA ------
+ *
+ * Implicit DFAs have a O(1) memory usage during evaluation, which all lives on the stack
+ * or in registers (this does not include the successor string, which is provided by the
+ * caller).
+ *
+ * Since the sparse state stepping is currently not as fast as explicit DFA node traversal,
+ * string matching is slower than with the explicit DFA.
+ *
+ * ====== In short ======
+ *
+ * - Immutable; build once, run many times.
+ * - Explicit DFA build time is amortized linear in target string size.
+ *  - Implicit DFA build time is O(1) (aside from initial UTF-32 conversion).
+ * - Zero-allocation matching.
+ * - Matching takes in raw UTF-8 input, no need to pre-convert.
+ * - Streaming UTF-8 to UTF-32 conversion; fully unicode-aware (DFA uses UTF-32 code
+ * points internally).
+ * - If required, it's possible (but not currently implemented) to bake case
+ * insensitive matching semantics into the generated DFA itself.
+ * - Allows for dictionary forward-skipping via successor algorithm.
+ * - Amortized zero allocations for successor string building when reusing string
+ * between matches.
+ * - Successor string is generated in-place as UTF-8 and can be directly used as input
+ * to a byte-wise dictionary seek.
+ */
+class LevenshteinDfa {
+public:
+ class MatchResult {
+ uint8_t _max_edits;
+ uint8_t _edits;
+ public:
+ constexpr MatchResult(uint8_t max_edits, uint8_t edits) noexcept
+ : _max_edits(max_edits),
+ _edits(edits)
+ {}
+
+ static constexpr MatchResult make_match(uint8_t max_edits, uint8_t edits) noexcept {
+ return {max_edits, edits};
+ }
+
+ static constexpr MatchResult make_mismatch(uint8_t max_edits) noexcept {
+ return {max_edits, static_cast<uint8_t>(max_edits + 1)};
+ }
+
+ [[nodiscard]] constexpr bool matches() const noexcept { return _edits <= _max_edits; }
+ [[nodiscard]] constexpr uint8_t edits() const noexcept { return _edits; }
+ [[nodiscard]] constexpr uint8_t max_edits() const noexcept { return _max_edits; }
+ };
+
+ struct Impl {
+ virtual ~Impl() = default;
+ [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0;
+ [[nodiscard]] virtual size_t memory_usage() const noexcept = 0;
+ virtual void dump_as_graphviz(std::ostream& out) const = 0;
+ };
+
+private:
+ std::unique_ptr<Impl> _impl;
+public:
+ explicit LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept;
+ LevenshteinDfa(LevenshteinDfa&&) noexcept;
+ LevenshteinDfa& operator=(LevenshteinDfa&&) noexcept;
+ LevenshteinDfa(const LevenshteinDfa&) = delete;
+ LevenshteinDfa& operator=(const LevenshteinDfa&) = delete;
+ ~LevenshteinDfa();
+
+ /**
+ * Attempts to match the source string `source` with the target string this DFA was
+ * built with, emitting a successor string on mismatch if `successor_out` != nullptr.
+ *
+ * `source` must not contain any null UTF-8 chars.
+ *
+ * Match case:
+ * Iff `source` is _within_ the maximum edit distance, returns a MatchResult with
+ * matches() == true and edits() == the actual edit distance. If `successor_out`
+ * is not nullptr, the string pointed to is _not_ modified.
+ *
+ * Mismatch case:
+ * Iff `source` is _beyond_ the maximum edit distance, returns a MatchResult with
+ * matches() == false.
+ *
+ * Iff `successor_out` is not nullptr, the following holds:
+ * - `successor_out` is modified to contain the next (in byte-wise ordering) possible
+ * _matching_ string S so that there exists no other matching string S' that is
+ * greater than `source` but smaller than S.
+ * - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally
+ * encode in bitwise form, but the _code points_ they encode may not be valid.
+ * In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of
+     *     which is valid UTF-8.
+ *
+ * It is expected that the consumer of `successor_out` is only interested in the
+ * memcmp()-ordering of strings and not whether they are technically valid Unicode.
+ * This should be the case for low-level dictionary data structures etc.
+ *
+ * Memory allocation:
+ * This function does not directly or indirectly allocate any heap memory if either:
+ *
+ * - the input string is within the max edit distance, or
+ * - `successor_out` is nullptr, or
+ * - `successor_out` has sufficient capacity to hold the generated successor
+ *
+ * By reusing the successor string across many calls, this therefore amortizes memory
+ * allocations down to near zero per invocation.
+ */
+ [[nodiscard]] MatchResult match(std::string_view source, std::string* successor_out) const;
+
+ /**
+ * Returns how much memory is used by the underlying DFA representation, in bytes.
+ */
+ [[nodiscard]] size_t memory_usage() const noexcept;
+
+ enum class DfaType {
+ Implicit,
+ Explicit
+ };
+
+ /**
+ * Builds and returns a Levenshtein DFA that matches all strings within `max_edits`
+ * edits of `target_string`. The type of DFA returned is specified by dfa_type.
+ *
+ * `max_edits` must be in {1, 2}. Throws std::invalid_argument if outside range.
+ *
+ * `target_string` must not contain any null UTF-8 chars.
+ */
+ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string,
+ uint8_t max_edits,
+ DfaType dfa_type);
+
+ /**
+ * Same as build() but currently always returns an implicit DFA.
+ */
+ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits);
+
+ /**
+ * Dumps the DFA as a Graphviz graph in text format to the provided output stream.
+ *
+ * Note: Only supported for _explicit_ DFAs. Trying to call this function on an implicit
+ * DFA will throw a std::runtime_error, as there is no concrete underlying graph
+ * structure to dump.
+ *
+ * Note that only _matching_ state transitions are present in the DFA, and therefore only
+ * such transitions are present in the generated graph. Overall this makes the graph for
+ * longer strings much more manageable, as the number of out-edges from a particular depth
+ * in the graph depends on the max number of edits and not on the length of the string
+ * itself. Otherwise, you'd have a whole bunch of nodes with out-edges to the same terminal
+ * non-matching state node.
+ */
+ void dump_as_graphviz(std::ostream& out) const;
+};
+
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos);
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt);
+
+}
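
The successor contract documented above is what enables dictionary skipping: on a mismatch, seek straight to the successor instead of stepping one entry at a time. A hedged sketch of that loop, using std::set purely as a stand-in for a sorted dictionary and assuming the header is on the include path:

    #include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
    #include <iostream>
    #include <set>
    #include <string>

    int main() {
        using vespalib::fuzzy::LevenshteinDfa;
        const std::set<std::string> dict = {"flood", "fob", "food", "foods", "fox", "zebra"};
        auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::DfaType::Explicit);

        std::string successor; // reused across calls to amortize allocations
        auto it = dict.begin();
        while (it != dict.end()) {
            auto res = dfa.match(*it, &successor);
            if (res.matches()) {
                std::cout << *it << " (" << int(res.edits()) << " edits)\n";
                ++it;                             // successor is left untouched on a match
            } else {
                it = dict.lower_bound(successor); // skip everything ordered before the successor
            }
        }
        return 0;
    }

With target "food" and k=1 this would print the entries within one edit ("flood", "food", "foods") while leaping past the rest via lower_bound() seeks.
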
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
new file mode 100644
index 00000000000..206b69f8ebe
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -0,0 +1,291 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "dfa_matcher.h"
+#include "levenshtein_dfa.h"
+#include "unicode_utils.h"
+#include <vespa/vespalib/text/utf8.h>
+#include <cassert>
+#include <concepts>
+
+namespace vespalib::fuzzy {
+
+/**
+ * Implementation of algorithm for linear-time k-max edits string matching and successor
+ * string generation over an abstract DFA representation.
+ *
+ * The implementation is agnostic to how the underlying DFA is implemented, but requires
+ * an appropriate adapter that satisfies the DfaMatcher concept contracts.
+ */
+template <uint8_t MaxEdits>
+struct MatchAlgorithm {
+ using MatchResult = LevenshteinDfa::MatchResult;
+
+ static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+
+ /**
+ * Matches UTF-8 source string `source` against the target DFA, optionally generating
+ * the successor string iff the source string is not within the maximum number of edits
+ * of the target string.
+ *
+ * The actual match loop is very simple: we try to match the DFA as far as we can
+ * before either consuming all input (source string) characters or ending up in a non-
+ * matching state before we have consumed all input. In the former case, we may be in
+ * a matching state (consider matching "foo" with the target string "food"; after
+ * consuming all input we'll be in a matching state with 1 edit). In the latter case,
+     * the input string cannot possibly match.
+ *
+ * If we end up in a matching state, all is well. We simply return a MatchResult with
+ * the number of edits the state represents.
+ *
+     * The interesting bit happens when the string does _not_ match and we are asked to provide a
+ * _successor_ string that _does_ match and is strictly greater in lexicographic order.
+ *
+ * We lean on some core invariants:
+ *
+ * - The m x n (|source| x |target|) Levenshtein matrix provides, for any m[i, j] with
+ * i in [1, m], j in [1, n], the _minimum possible_ number of edits that can transform
+ * the source string prefix of length `i` to the target string prefix of length `j`.
+ * This means there is no way of transforming said source prefix using _fewer_ edits.
+ *
+ * - Any given DFA state corresponds to a unique row in the Levenshtein matrix, thus
+ * transitively inheriting the invariants of the matrix row elements themselves, such
+ * as representing the minimum number of edits.
+ *
+ * We have two mismatch cases:
+ *
+ * 1. We've matched the entire source string without ending in an accepting state.
+ *
+ * This can only happen if the input is a (possibly edited) prefix of the target string.
+     * Any and all _longer_ strings with this prefix are inherently lexicographically greater,
+ * so we emit the smallest possible suffix that turns prefix || suffix into a matching
+ * string.
+ *
+ * See emit_smallest_matching_suffix() for details.
+ *
+ * 2. We've matched a prefix of the source string without ending in an accepting state.
+ *
+ * This case is trickier than when the entire source string is a prefix, as we cannot
+ * just emit a suffix to the source to create a matching, lexicographically greater string.
+ *
+ * Consider source "foxx" and target "food". There exists no suffix S in "food" that can
+ * turn "foxx" || S into a matching string within k=1 edits.
+ *
+ * So we have to backtrack to somewhere.
+ *
+ * That "somewhere" is the state that maximizes the size of the source prefix while
+ * allowing us to emit a greater suffix.
+ *
+ * For each state we visit, we check if there exists at least one higher out edge than the
+ * one taken out from that state (this is possibly a wildcard edge). If one exists, we
+ * copy the state to `last_state_with_higher_out` and remember the state's source string
+ * prefix as well as the source string character that transitions us away from the state
+ * (this will be our candidate for building a greater suffix).
+ *
+ * When we fail to match the entire source string, we know that last_state_with_higher_out
+ * represents the last possible branching point (and therefore the longest prefix) where
+ * we can substitute in or insert a higher character, in turn creating a greater suffix.
+ *
+ * Proof by contradiction: let `last_state_with_higher_out` be S and assume there exists
+ * a state S' that has a greater source string prefix than S while still allowing for
+ * emitting a lexicographically greater suffix that is within max edits k. We terminate
+ * the match loop once can_match(X) is false for any state X, where X subsumes S by
+ * definition. For S' to exist, it must be possible for a transition to exist from X to
+ * a later state that can have a higher out edge. However, edit distance costs can
+ * never decrease, only stay constant (with matching substitutions) or increase (with
+ * insertions, deletions or non-matching substitutions), so it's impossible to follow
+ * an out-edge from X to any later potentially matching state. Thus, S' can not exist
+ * and we have a contradiction.
+ *
+ * Since we want to generate the smallest possible larger string that matches, we ideally
+ * want to emit a character that is +1 of the source character after the shared prefix.
+     * This uses the "higher out"-character we remembered earlier. We do this if we have
+ * a wildcard out edge (or if there exists an explicit out-edge for value char+1).
+ * Otherwise, we have to follow the highest explicitly present out-edge.
+ *
+ * Once we have emitted one single character that gets us lexicographically higher than
+ * the source string, we then emit the smallest possible suffix to this. This uses the
+ * same minimal suffix generation logic as mismatch case 1).
+ *
+ * See `backtrack_and_emit_greater_suffix()` for details.
+ *
+ * Example:
+ * (This is easiest to follow by looking at examples/food_dfa.svg)
+ *
+ * Source "foxx", target "food" and k=1:
+ *
+ * After matching "fo" with 0 edits we reach a state with out-edges {d, o, *}. This state
+ * has an implicitly higher out-edge (*) and we remember it and the char 'x' for later.
+ * Edge 'x' can only happen via *, so we take that path.
+ *
+ * After matching "fox" with 1 edit we reach a state with out-edges {d, o}. There is
+ * no out-edge for 'x' and the state is not a matching state, so we need to backtrack
+ * and generate a successor.
+ *
+ * We backtrack to the state representing "fo" and emit it as a successor prefix. We
+ * observe that this state has a wildcard out-edge and emit 'x'+1 == 'y' to the successor
+ * string and continue with emitting the smallest suffix. We now have a successor
+ * prefix of "foy", with which we reach the same logical state as we did with "fox"
+ * previously. The smallest out-edge here is 'd', so we take it. This leaves us in an
+ * accepting (matching) state, so suffix generation completes.
+ *
+ * "foxx" -> "foyd"
+ *
+ * Note that it's possible for the prefix to be empty, which results in a successor
+ * that has nothing in common with the source altogether.
+ * Example: "gp" -> "hfood" (+1 char value case)
+ *
+ * Performance note:
+ * Both the input and successor output strings are in UTF-8 format. To avoid doing
+ * duplicate work, we keep track of the byte length of the string prefix that will be
+ * part of the successor and simply copy it verbatim instead of building the string
+ * from converted UTF-32 -> UTF-8 chars as we go.
+ *
+ * TODO we could probably also optimize the smallest suffix generation with this when
+ * we know we can no longer insert any smaller char substitutions and the only way
+ * to complete the string is to emit it verbatim.
+ * - To do this we'd need both the original UTF-8 target string as well as a
+ * secondary vector that maps u32 character index to the corresponding UTF-8 index.
+ * Both trivial to get as part of DFA initialization.
+ */
+ template <DfaMatcher Matcher>
+ static MatchResult match(const Matcher& matcher,
+ std::string_view source,
+ std::string* successor_out)
+ {
+ using StateType = typename Matcher::StateType;
+ vespalib::Utf8Reader u8_reader(source.data(), source.size());
+ uint32_t n_prefix_u8_bytes = 0;
+ uint32_t char_after_prefix = 0;
+ StateType last_state_with_higher_out = StateType{};
+
+ StateType state = matcher.start();
+ while (u8_reader.hasMore()) {
+ const auto u8_pos_before_char = u8_reader.getPos();
+ const uint32_t mch = u8_reader.getChar();
+ if (successor_out && matcher.has_higher_out_edge(state, mch)) {
+ last_state_with_higher_out = state;
+ n_prefix_u8_bytes = u8_pos_before_char;
+ char_after_prefix = mch;
+ }
+ auto maybe_next = matcher.match_input(state, mch);
+ if (matcher.can_match(maybe_next)) {
+ state = maybe_next;
+ } else {
+ // Can never match; find the successor if requested
+ if (successor_out) {
+ *successor_out = source.substr(0, n_prefix_u8_bytes);
+ assert(matcher.valid_state(last_state_with_higher_out));
+ backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
+ char_after_prefix, *successor_out);
+ }
+ return MatchResult::make_mismatch(max_edits());
+ }
+ }
+ const auto edits = matcher.match_edit_distance(state);
+ if (edits <= max_edits()) {
+ return MatchResult::make_match(max_edits(), edits);
+ }
+ if (successor_out) {
+ *successor_out = source;
+ emit_smallest_matching_suffix(matcher, state, *successor_out);
+ }
+ return MatchResult::make_mismatch(max_edits());
+ }
+
+ /**
+ * Instantly backtrack to the last possible branching point in the DFA where we can
+ * choose some higher outgoing edge character value and still match the DFA. If the node
+ * has a wildcard edge, we can bump the input char by one and generate the smallest
+ * possible matching suffix to that. Otherwise, choose the smallest out edge that is
+ * greater than the input character at that location and _then_ emit the smallest
+ * matching prefix.
+ *
+     * precondition: `last_state_with_higher_out` has either a wildcard edge or a char match
+ * edge that compares greater than `input_at_branch`.
+ */
+ template <DfaMatcher Matcher>
+ static void backtrack_and_emit_greater_suffix(
+ const Matcher& matcher,
+ typename Matcher::StateParamType last_state_with_higher_out,
+ const uint32_t input_at_branch,
+ std::string& successor)
+ {
+ auto wildcard_state = matcher.match_wildcard(last_state_with_higher_out);
+ if (matcher.can_match(wildcard_state)) {
+ // `input_at_branch` may be U+10FFFF, with +1 being outside legal Unicode _code point_
+ // range but _within_ what UTF-8 can technically _encode_.
+ // We assume that successor-consumers do not care about anything except byte-wise
+ // ordering. This is similar to what RE2's PossibleMatchRange emits to represent a
+ // UTF-8 upper bound, so not without precedent.
+ // If the resulting character corresponds to an existing out-edge we _must_ take it
+ // instead of the wildcard edge, or we'll end up in the wrong state.
+ const auto next_char = input_at_branch + 1;
+ if (!matcher.has_exact_explicit_out_edge(last_state_with_higher_out, next_char)) {
+ append_utf32_char_as_utf8(successor, next_char);
+ emit_smallest_matching_suffix(matcher, wildcard_state, successor);
+ return;
+ } // else: handle exact match below (it will be found as the first higher out edge)
+ }
+ const auto first_highest_edge = matcher.lowest_higher_explicit_out_edge(last_state_with_higher_out, input_at_branch);
+ assert(matcher.valid_edge(first_highest_edge));
+ append_utf32_char_as_utf8(successor, matcher.edge_to_u32char(first_highest_edge));
+ emit_smallest_matching_suffix(matcher, matcher.edge_to_state(last_state_with_higher_out, first_highest_edge), successor);
+ }
+
+ /**
+ * The smallest possible suffix is generated by following the smallest out-edge per state,
+ * until we reach a state that is a match. It is possible that the smallest out edge is a
+ * "wildcard" edge (our terminology), which means that we can insert/substitute an arbitrary
+ * character and still have `can_match(resulting state)` be true. In this case we emit the
+ * smallest possible non-null UTF-8 character (0x01).
+ *
+ * Examples:
+ * (These are easiest to follow by looking at examples/food_dfa.svg)
+ *
+ * Source "fo", target "food" and k=1:
+ *
+ * After matching "fo" we have 1 edit to spare. The smallest valid, non-empty UTF-8 suffix
+ * to this string must necessarily begin with 0x01, so that's what we emit. The smallest
+     * edge we can follow from the resulting state is 'd', and that is an accepting (matching)
+ * state.
+ *
+ * "fo" -> "fo\x01d"
+ *
+ * Source "fx", target "food" and k=1:
+ *
+ * After matching "fx" we have no edits to spare. The smallest character reachable from
+ * the state is 'o' (in fact, it is the only out edge available since we're down to zero
+ * available edits). The next state has an out-edge to 'd' and 'o', and we choose 'd'
+ * since it is smallest. This leaves us in an accepting (matching) state and we terminate
+ * the loop.
+ *
+ * "fx" -> "fxod"
+ */
+ // TODO consider variant for only emitting _prefix of suffix_ to avoid having to generate
+ // the full string? Won't generate a matching string, but will be lexicographically greater.
+ template <DfaMatcher Matcher>
+ static void emit_smallest_matching_suffix(
+ const Matcher& matcher,
+ typename Matcher::StateParamType from,
+ std::string& str)
+ {
+ auto state = from;
+ while (!matcher.is_match(state)) {
+ // If we can take a wildcard path, emit the smallest possible valid UTF-8 character (0x01).
+ // Otherwise, find the smallest char that can eventually lead us to a match.
+ auto wildcard_state = matcher.match_wildcard(state);
+ if (matcher.can_match(wildcard_state)) {
+ str += '\x01';
+ state = wildcard_state;
+ } else {
+ const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state);
+ assert(matcher.valid_edge(smallest_out_edge));
+ append_utf32_char_as_utf8(str, matcher.edge_to_u32char(smallest_out_edge));
+ state = matcher.edge_to_state(state, smallest_out_edge);
+ }
+ }
+ }
+};
+
+}
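
The worked examples in the comments above ("foxx" backtracking to the successor "foyd", and the prefix case "fo" getting the smallest suffix "fo\x01d", both against target "food" with k=1) can be exercised end to end through the public API. A sketch assuming the headers are on the include path:

    #include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
    #include <cassert>
    #include <string>

    int main() {
        using vespalib::fuzzy::LevenshteinDfa;
        auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::DfaType::Explicit);
        std::string successor;

        // Mismatch case 2: backtrack and emit a greater suffix ("foxx" -> "foyd").
        auto res = dfa.match("foxx", &successor);
        assert(!res.matches());
        assert(successor == "foyd");

        // Mismatch case 1: source is a (too short) prefix; emit the smallest suffix ("fo" -> "fo\x01d").
        successor.clear();
        res = dfa.match("fo", &successor);
        assert(!res.matches());
        assert(successor == std::string("fo\x01") + "d"); // split to avoid \x01d parsing as one escape
        return 0;
    }
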
diff --git a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
new file mode 100644
index 00000000000..40cfa5e6409
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
@@ -0,0 +1,175 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <ostream>
+#include <span>
+#include <xxh3.h> // TODO factor out?
+
+namespace vespalib::fuzzy {
+
+// Sentinel U32 char for state stepping that cannot match any target string characters
+constexpr const uint32_t WILDCARD = UINT32_MAX;
+
+/**
+ * diag(n) is the width of the diagonal of the cost matrix that can possibly be
+ * within k edits. This means that for a fixed k, it suffices to maintain state
+ * for up to and including diag(k) consecutive cells for any given matrix row.
+ */
+constexpr inline uint8_t diag(uint8_t k) noexcept {
+ return k*2 + 1;
+}
+
+template <uint8_t MaxEdits>
+struct FixedSparseState {
+private:
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+
+ std::array<uint32_t, diag(MaxEdits)> indices;
+ std::array<uint8_t, diag(MaxEdits)> costs; // elems are 1-1 with indices vector
+ uint8_t sz;
+public:
+ constexpr FixedSparseState() noexcept : indices(), costs(), sz(0) {}
+
+ [[nodiscard]] constexpr bool empty() const noexcept {
+ return (sz == 0);
+ }
+
+ [[nodiscard]] constexpr uint32_t size() const noexcept {
+ return sz;
+ }
+
+ [[nodiscard]] constexpr uint32_t index(uint32_t entry_idx) const noexcept {
+ return indices[entry_idx];
+ }
+
+ [[nodiscard]] constexpr uint8_t cost(uint32_t entry_idx) const noexcept {
+ return costs[entry_idx];
+ }
+
+ // Precondition: !empty()
+ [[nodiscard]] constexpr uint32_t last_index() const noexcept {
+ return indices[sz - 1];
+ }
+
+ // Precondition: !empty()
+ [[nodiscard]] constexpr uint8_t last_cost() const noexcept {
+ return costs[sz - 1];
+ }
+
+ void append(uint32_t index, uint8_t cost) noexcept {
+ assert(sz < diag(MaxEdits));
+ indices[sz] = index;
+ costs[sz] = cost;
+ ++sz;
+ }
+
+ constexpr bool operator==(const FixedSparseState& rhs) const noexcept {
+ if (sz != rhs.sz) {
+ return false;
+ }
+ return (std::equal(indices.begin(), indices.begin() + sz, rhs.indices.begin()) &&
+ std::equal(costs.begin(), costs.begin() + sz, rhs.costs.begin()));
+ }
+
+ struct hash {
+ size_t operator()(const FixedSparseState& s) const noexcept {
+ static_assert(std::is_same_v<uint32_t, std::decay_t<decltype(s.indices[0])>>);
+ static_assert(std::is_same_v<uint8_t, std::decay_t<decltype(s.costs[0])>>);
+ // FIXME GCC 12.2 worse-than-useless(tm) warning false positives :I
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+ return (XXH3_64bits(s.indices.data(), s.sz * sizeof(uint32_t)) ^
+ XXH3_64bits(s.costs.data(), s.sz));
+#pragma GCC diagnostic pop
+ }
+ };
+};
+
+/**
+ * Prints sparse states as a single matrix row. Columns prior to any state index
+ * are printed explicitly as '-' characters to make states line up when printed.
+ *
+ * Example output for the state (2:1, 3:1):
+ *
+ * [-, -, 1, 1]
+ *
+ * Only meant as a debugging aid during development, as states with high indices
+ * will emit very large strings.
+ */
+template <uint8_t MaxEdits> [[maybe_unused]]
+std::ostream& operator<<(std::ostream& os, const FixedSparseState<MaxEdits>& s) {
+ os << "[";
+ size_t last_idx = 0;
+ for (size_t i = 0; i < s.size(); ++i) {
+ if (i != 0) {
+ os << ", ";
+ }
+ for (size_t j = last_idx; j < s.indices[i]; ++j) {
+ os << "-, ";
+ }
+ last_idx = s.indices[i] + 1;
+ os << static_cast<uint32_t>(s.costs[i]);
+ }
+ os << "]";
+ return os;
+}
+
+template <uint8_t MaxEdits>
+struct FixedMaxEditsTransitions {
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+
+ std::array<uint32_t, diag(MaxEdits)> out_u32_chars;
+ uint8_t size;
+
+ constexpr FixedMaxEditsTransitions() noexcept : out_u32_chars(), size(0) {}
+
+ [[nodiscard]] constexpr bool has_char(uint32_t u32ch) const noexcept {
+ for (uint8_t i = 0; i < size; ++i) {
+ if (out_u32_chars[i] == u32ch) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void add_char(uint32_t u32ch) noexcept {
+ if (!has_char(u32ch)) {
+ assert(size < diag(MaxEdits));
+ out_u32_chars[size] = u32ch;
+ ++size;
+ }
+ }
+
+ constexpr std::span<const uint32_t> u32_chars() const noexcept {
+ return {out_u32_chars.begin(), out_u32_chars.begin() + size};
+ }
+
+ constexpr std::span<uint32_t> u32_chars() noexcept {
+ return {out_u32_chars.begin(), out_u32_chars.begin() + size};
+ }
+
+ void sort() noexcept {
+ // TODO use custom sorting networks for fixed array sizes <= 5?
+ // FIXME GCC 12.2 worse-than-useless(tm) warning false positives :I
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+ std::sort(out_u32_chars.begin(), out_u32_chars.begin() + size);
+#pragma GCC diagnostic pop
+ }
+};
+
+template <uint8_t MaxEdits>
+struct FixedMaxEditDistanceTraits {
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+ using StateType = FixedSparseState<MaxEdits>;
+ using TransitionsType = FixedMaxEditsTransitions<MaxEdits>;
+ constexpr static uint8_t max_edits() noexcept {
+ return MaxEdits;
+ }
+};
+
+}
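
Each FixedSparseState above is one row of the Levenshtein matrix restricted to the diag(k)-wide band, stored as (index, cost) pairs. A small standalone sketch that prints such a row in the format documented for operator<<, reproducing the (2:1, 3:1) example:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct Entry { uint32_t index; uint8_t cost; };

    // Columns before each stored index print as '-' so states line up, as documented above.
    void print_sparse_row(const std::vector<Entry>& row) {
        std::cout << '[';
        uint32_t next_col = 0;
        for (size_t i = 0; i < row.size(); ++i) {
            if (i != 0) {
                std::cout << ", ";
            }
            for (; next_col < row[i].index; ++next_col) {
                std::cout << "-, ";
            }
            ++next_col;
            std::cout << static_cast<uint32_t>(row[i].cost);
        }
        std::cout << "]\n";
    }

    int main() {
        print_sparse_row({{2, 1}, {3, 1}});   // prints: [-, -, 1, 1]
        return 0;
    }
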
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
new file mode 100644
index 00000000000..648be234562
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -0,0 +1,108 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "unicode_utils.h"
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <stdexcept>
+
+namespace vespalib::fuzzy {
+
+std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) {
+ vespalib::stringref ch_str(str.data(), str.size());
+ vespalib::Utf8Reader utf8_reader(ch_str);
+ std::vector<uint32_t> u32ret;
+ u32ret.reserve(str.size()); // Will over-allocate for all non-ASCII
+ while (utf8_reader.hasMore()) {
+ u32ret.emplace_back(utf8_reader.getChar());
+ }
+ return u32ret;
+}
+
+std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) {
+ return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size()));
+}
+
+[[noreturn]] void throw_bad_code_point(uint32_t codepoint) __attribute__((noinline));
+[[noreturn]] void throw_bad_code_point(uint32_t codepoint) {
+ throw std::invalid_argument(make_string("invalid UTF-32 codepoint: U+%04X (%u)", codepoint, codepoint));
+}
+
+namespace {
+
+/**
+ * Encodes a single UTF-32 `codepoint` to a 1-4 byte UTF-8 sequence.
+ *
+ * `u8buf` must point to a buffer with at least 4 writable bytes.
+ *
+ * Returns the number of bytes written.
+ *
+ * See comments on append_utf32_char_as_utf8() as to why this is not a generic UTF-8
+ * encoding function that can be used in all possible scenarios.
+ */
+[[nodiscard]] uint8_t encode_utf8_char(uint32_t codepoint, unsigned char* u8buf) {
+ constexpr const uint8_t low_6bits_mask = 0x3F;
+
+ // Yanked and modified from utf8.cpp:
+ if (codepoint < 0x80) {
+ u8buf[0] = (char) codepoint;
+ return 1;
+ } else if (codepoint < 0x800) {
+ char low6 = (codepoint & low_6bits_mask);
+ low6 |= 0x80;
+ codepoint >>= 6;
+ char first5 = codepoint;
+ first5 |= 0xC0;
+ u8buf[0] = first5;
+ u8buf[1] = low6;
+ return 2;
+ } else if (codepoint < 0x10000) {
+ char low6 = (codepoint & low_6bits_mask);
+ low6 |= 0x80;
+
+ codepoint >>= 6;
+ char mid6 = (codepoint & low_6bits_mask);
+ mid6 |= 0x80;
+
+ codepoint >>= 6;
+ char first4 = codepoint;
+ first4 |= 0xE0;
+
+ u8buf[0] = first4;
+ u8buf[1] = mid6;
+ u8buf[2] = low6;
+ return 3;
+ } else if (codepoint <= 0x110000) { // Explicitly _include_ U+10FFFF + 1!
+ char low6 = (codepoint & low_6bits_mask);
+ low6 |= 0x80;
+
+ codepoint >>= 6;
+ char mid6 = (codepoint & low_6bits_mask);
+ mid6 |= 0x80;
+
+ codepoint >>= 6;
+ char hi6 = (codepoint & low_6bits_mask);
+ hi6 |= 0x80;
+
+ codepoint >>= 6;
+ char first3 = codepoint;
+ first3 |= 0xF0;
+
+ u8buf[0] = first3;
+ u8buf[1] = hi6;
+ u8buf[2] = mid6;
+ u8buf[3] = low6;
+ return 4;
+ } else {
+ throw_bad_code_point(codepoint);
+ }
+}
+
+} // anon ns
+
+// TODO optimize inlined in header for case where u32_char is < 0x80?
+void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char) {
+ unsigned char u8buf[4];
+ uint8_t u8bytes = encode_utf8_char(u32_char, u8buf);
+ out_str.append(reinterpret_cast<const char*>(u8buf), u8bytes);
+}
+
+}
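
The 2-byte branch above can be checked by hand: U+00E9 is 0b11101001, so the low 6 bits give 0xA9 after OR-ing in the continuation marker and the remaining bits give 0xC3 after OR-ing in the 2-byte lead marker. A standalone sketch of just that branch (the real function above also handles the 1-, 3- and 4-byte forms and deliberately accepts U+10FFFF + 1):

    #include <cassert>
    #include <cstdint>

    // 2-byte UTF-8 encoding for code points in [0x80, 0x800), mirroring the branch above.
    void encode_2_bytes(uint32_t cp, unsigned char out[2]) {
        assert(cp >= 0x80 && cp < 0x800);
        out[1] = static_cast<unsigned char>((cp & 0x3F) | 0x80); // low 6 bits + continuation marker
        out[0] = static_cast<unsigned char>((cp >> 6) | 0xC0);   // top 5 bits + 2-byte lead marker
    }

    int main() {
        unsigned char buf[2];
        encode_2_bytes(0xE9, buf);                    // U+00E9
        return (buf[0] == 0xC3 && buf[1] == 0xA9) ? 0 : 1;
    }
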
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
new file mode 100644
index 00000000000..8627b01ff6a
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);
+
+std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
+
+/**
+ * Encodes a single UTF-32 codepoint `u32_char` to a 1-4 byte UTF-8 sequence and
+ * appends it to `out_str`.
+ *
+ * Note that this will happily encode code points that aren't technically part of
+ * the valid UTF-8 range, but which will still be correct in memcmp() byte-wise
+ * ordering, which is the API contract we expose.
+ *
+ * In particular, this includes:
+ * - high/low surrogate ranges U+D800 through U+DFFF (surrogate pairs not allowed
+ * in UTF-8)
+ * - U+10FFFF + 1 (outside max code point range by one)
+ *
+ * ... So don't copy this function for use as a general UTF-8 emitter, as it is not
+ * _technically_ conformant!
+ */
+void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char);
+
+}