summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--vespalib/src/tests/fuzzy/CMakeLists.txt9
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp507
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt10
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h70
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h299
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg286
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp11
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h147
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp228
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp9
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h35
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp121
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp83
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h244
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp291
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/sparse_state.h175
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp108
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h33
18 files changed, 2663 insertions, 3 deletions
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt
index bc48e775711..00a89d0a604 100644
--- a/vespalib/src/tests/fuzzy/CMakeLists.txt
+++ b/vespalib/src/tests/fuzzy/CMakeLists.txt
@@ -16,3 +16,12 @@ vespa_add_executable(vespalib_levenshtein_distance_test_app TEST
GTest::GTest
)
vespa_add_test(NAME vespalib_levenshtein_distance_test_app COMMAND vespalib_levenshtein_distance_test_app)
+
+vespa_add_executable(vespalib_levenshtein_dfa_test_app TEST
+ SOURCES
+ levenshtein_dfa_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+)
+vespa_add_test(NAME vespalib_levenshtein_dfa_test_app COMMAND vespalib_levenshtein_dfa_test_app)
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
new file mode 100644
index 00000000000..6966fd0b703
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -0,0 +1,507 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+#include <vespa/vespalib/fuzzy/dfa_stepping_base.h>
+#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/fuzzy/levenshtein_distance.h> // For benchmarking purposes
+#include <vespa/vespalib/util/benchmark_timer.h>
+#include <charconv>
+#include <concepts>
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <string_view>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using namespace vespalib::fuzzy;
+namespace fs = std::filesystem;
+
+static std::string benchmark_dictionary;
+
+struct LevenshteinDfaTest : TestWithParam<LevenshteinDfa::DfaType> {
+
+ static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+
+ static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+ auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type());
+ auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+
+ auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type());
+ auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+
+ EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
+ if (maybe_match_lhs.matches()) {
+ EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
+ return {maybe_match_lhs.edits()};
+ }
+ return std::nullopt;
+ }
+
+ static std::optional<uint32_t> calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) {
+ std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
+ std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
+ return calculate(lhs_ch, rhs_ch, threshold);
+ }
+
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+ LevenshteinDfaTest,
+ Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit),
+ PrintToStringParamName());
+
+// Same as existing non-DFA Levenshtein tests, but with some added instantiations
+// for smaller max distances.
+TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) {
+ EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0});
+ for (auto max : {1, 2}) {
+ EXPECT_EQ(calculate("abc", "ab1", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "1bc", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "a1c", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max;
+ }
+ EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+ EXPECT_EQ(calculate("ab", "", 1), std::nullopt);
+ EXPECT_EQ(calculate("ab", "", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "", 2), std::nullopt);
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+}
+
+TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
+ // Each hiragana/katakana/kanji corresponds to multiple (3) UTF-8 chars but a single UTF-32 code point.
+ EXPECT_EQ(calculate(u8"猫", u8"猫", 2), std::optional{0});
+ EXPECT_EQ(calculate(u8"猫", u8"犬", 2), std::optional{1});
+ EXPECT_EQ(calculate(u8"猫と犬", u8"犬と猫", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"猫は好き", u8"犬が好き", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カラオケ", 2), std::optional{0});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カラoケ", 2), std::optional{1});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カraオケ", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"kaラオケ", u8"カラオケ", 2), std::optional{2});
+ EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt);
+}
+
+void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) {
+ std::string successor;
+ auto m = dfa.match(source, &successor);
+ if (m.matches()) {
+ FAIL() << "Expected '" << source << "' to emit a successor, but it "
+ << "matched with " << static_cast<uint32_t>(m.edits())
+ << " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)";
+ }
+ EXPECT_EQ(successor, expected_successor);
+ EXPECT_TRUE(dfa.match(successor, nullptr).matches());
+}
+
+TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
+ auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+
+ test_dfa_successor(dfa, "", "\x01""food");
+ test_dfa_successor(dfa, "faa", "faod");
+ test_dfa_successor(dfa, "fooooo", "foop");
+ test_dfa_successor(dfa, "ooof", "pfood");
+ test_dfa_successor(dfa, "fo", "fo\x01""d");
+ test_dfa_successor(dfa, "oo", "ood");
+ test_dfa_successor(dfa, "ooo", "oood");
+ test_dfa_successor(dfa, "foh", "fohd");
+ test_dfa_successor(dfa, "foho", "fohod");
+ test_dfa_successor(dfa, "foxx", "foyd");
+ test_dfa_successor(dfa, "xfa", "xfood");
+ test_dfa_successor(dfa, "gg", "good");
+ test_dfa_successor(dfa, "gp", "hfood");
+ test_dfa_successor(dfa, "ep", "f\x01""od");
+ test_dfa_successor(dfa, "hfoodz", "hood");
+ test_dfa_successor(dfa, "aooodz", "bfood");
+
+ // Also works with Unicode
+ // 2 chars
+ test_dfa_successor(dfa, "\xc3\x86""x", // "Æx"
+ "\xc3\x87""food"); // "Çfood"
+ // 3 chars
+ test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は"
+ "\xe7\x8c\xac""food"); // "猬food"
+ // 4 chars
+ test_dfa_successor(dfa, "\xf0\x9f\xa4\xa9""abc", // <starry eyed emoji>abc
+ "\xf0\x9f\xa4\xa9""food"); // <starry eyed emoji>food
+
+ // Note that as a general rule, emojis are fickle beasts to deal with since a single
+ // emoji often takes up multiple code points, which we consider separate characters
+ // but a user sees as a single actual rendered glyph.
+ // Multi-code point character edit distance support is left as an exercise for the reader :D
+}
+
+TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
+ auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+ // The successor string must be lexicographically larger than the input string.
+ // In the presence of a wildcard output edge we handle this by increase the input
+ // character by 1 and encoding it back as UTF-8.
+ // It is possible (though arguably very unlikely) that the input character is
+ // U+10FFFF, which is the maximum valid Unicode character. We have to ensure that
+ // we can encode U+10FFFF + 1, even though it's technically outside the valid range.
+ // Luckily, UTF-8 can technically (there's that word again) encode up to U+1FFFFF,
+ // so the resulting string is byte-wise greater, and that's what matters since we
+ // don't guarantee that the successor string is _valid_ UTF-8.
+ // This problem does not happen with the target string, as it's an invalid character
+ // and will be replaced with the Unicode replacement char before we ever see it.
+ test_dfa_successor(dfa, "\xf4\x8f\xbf\xbf""xyz", // U+10FFFF
+ "\xf4\x90\x80\x80""food");// U+10FFFF+1
+}
+
+TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) {
+ auto dfa = LevenshteinDfa::build("", 1, dfa_type());
+ test_dfa_successor(dfa, "aa", "b");
+ test_dfa_successor(dfa, "b\x01", "c");
+ test_dfa_successor(dfa, "vespa", "w");
+}
+
+// We should normally be able to rely on higher-level components to ensure we
+// only receive valid UTF-8, but make sure we don't choke on it if we do get it.
+TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) {
+ // 0xff is not a valid encoding and is implicitly converted to U+FFFD,
+ // which is the standard Unicode replacement character.
+ EXPECT_EQ(calculate("\xff", "a", 2), std::optional{1});
+ EXPECT_EQ(calculate("\xff\xff", "a", 2), std::optional{2});
+ EXPECT_EQ(calculate("a", "\xff", 2), std::optional{1});
+ EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt);
+ EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
+}
+
+TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) {
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument);
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument);
+}
+
+// Turn integer v into its bitwise string representation with the MSB as the leftmost character.
+template <std::unsigned_integral T>
+std::string bits_to_str(T v) {
+ constexpr const uint8_t n_bits = sizeof(T) * 8;
+ std::string ret(n_bits, '0');
+ for (uint8_t bit = 0; bit < n_bits; ++bit) {
+ if (v & (1 << bit)) {
+ ret[n_bits - bit - 1] = '1';
+ }
+ }
+ return ret;
+}
+
+using DfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::DfaType, uint32_t>;
+
+struct LevenshteinDfaSuccessorTest : TestWithParam<DfaTypeAndMaxEdits> {
+ // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number.
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+ std::ostringstream ss;
+ ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+ return ss.str();
+ }
+};
+
+INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
+ LevenshteinDfaSuccessorTest,
+ Combine(Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit),
+ Values(1, 2)),
+ LevenshteinDfaSuccessorTest::stringify_params);
+
+/**
+ * Exhaustively test successor generation by matching all target and source strings
+ * in {0,1}^8 against each other. Since we generate bit strings identical to the
+ * bit patterns of the underlying counter(s), any string at index `i+1` will compare
+ * lexicographically greater than the one at `i`. We use this to test that we never
+ * miss a valid match that comes between a mismatch and its generated successor.
+ *
+ * For each mismatch we note the successor it emitted. Verify that each subsequent
+ * match() invocation for a source string < the successor results in a mismatch.
+ *
+ * We test this for both max edit distance 1 and 2. Despite being an exhaustive test,
+ * this completes in a few dozen milliseconds even with ASan instrumentation.
+ *
+ * Inspired by approach used by Lucene DFA exhaustive testing.
+ */
+TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
+ const auto [dfa_type, max_edits] = GetParam();
+ for (uint32_t i = 0; i < 256; ++i) {
+ const auto target = bits_to_str(static_cast<uint8_t>(i));
+ auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type);
+ std::string skip_to, successor;
+ for (uint32_t j = 0; j < 256; ++j) {
+ const auto source = bits_to_str(static_cast<uint8_t>(j));
+ auto maybe_match = target_dfa.match(source, &successor);
+ if (maybe_match.matches() && !skip_to.empty()) {
+ ASSERT_GE(source, skip_to);
+ } else if (!maybe_match.matches()) {
+ ASSERT_FALSE(successor.empty()) << source;
+ ASSERT_GE(successor, skip_to) << source;
+ ASSERT_GT(successor, source) << source;
+ skip_to = successor;
+ }
+ }
+ }
+}
+
+namespace {
+
+template <uint8_t MaxEdits>
+void explore(const DfaSteppingBase<FixedMaxEditDistanceTraits<MaxEdits>>& stepper,
+ const typename DfaSteppingBase<FixedMaxEditDistanceTraits<MaxEdits>>::StateType& in_state)
+{
+ ASSERT_EQ(stepper.can_match(stepper.step(in_state, WILDCARD)),
+ stepper.can_wildcard_step(in_state));
+ if (!stepper.can_match(in_state)) {
+ return; // reached the end of the line
+ }
+ // DFS-explore all matching transitions, as well as one non-matching transition
+ auto t = stepper.transitions(in_state);
+ for (uint32_t c: t.u32_chars()) {
+ ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.step(in_state, c)));
+ }
+ ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.step(in_state, WILDCARD)));
+}
+
+} // anon ns
+
+using StateStepperTypes = Types<
+ DfaSteppingBase<FixedMaxEditDistanceTraits<1>>,
+ DfaSteppingBase<FixedMaxEditDistanceTraits<2>>
+>;
+
+template <typename SteppingBase>
+struct LevenshteinSparseStateTest : Test {};
+
+TYPED_TEST_SUITE(LevenshteinSparseStateTest, StateStepperTypes);
+
+// "Meta-test" for checking that the `can_wildcard_step` predicate function is
+// functionally equivalent to evaluating `can_match(stepper.step(in_state, WILDCARD))`
+TYPED_TEST(LevenshteinSparseStateTest, wildcard_step_predcate_is_equivalent_to_step_with_can_match) {
+ for (const char* target : {"", "a", "ab", "abc", "abcdef", "aaaaa"}) {
+ auto u32_target = utf8_string_to_utf32(target);
+ TypeParam stepper(u32_target);
+ ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.start()));
+ }
+}
+
+template <typename T>
+void do_not_optimize_away(T&& t) noexcept {
+ asm volatile("" : : "m"(t) : "memory"); // Clobber the value to avoid losing it to compiler optimizations
+}
+
+enum class BenchmarkType {
+ DfaExplicit,
+ DfaImplicit,
+ Legacy
+};
+
+const char* to_s(BenchmarkType t) noexcept {
+ // Note: need underscores since this is used as part of GTest-generated test instance names
+ switch (t) {
+ case BenchmarkType::DfaExplicit: return "DFA_explicit";
+ case BenchmarkType::DfaImplicit: return "DFA_implicit";
+ case BenchmarkType::Legacy: return "legacy";
+ }
+ abort();
+}
+
+[[nodiscard]] bool benchmarking_enabled() noexcept {
+ return !benchmark_dictionary.empty();
+}
+
+[[nodiscard]] std::vector<uint32_t> string_lengths() {
+ return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64};
+}
+
+struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
+
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+ return to_s(info.param);
+ }
+
+ void SetUp() override {
+ if (!benchmarking_enabled()) {
+ GTEST_SKIP() << "benchmarking not enabled";
+ }
+ }
+
+ static BenchmarkType benchmark_type() noexcept { return GetParam(); }
+
+ static const std::vector<std::string>& load_dictionary_once() {
+ static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary));
+ return sorted_lines;
+ }
+
+ static std::vector<std::string> read_and_sort_all_lines(const fs::path& file_path) {
+ std::ifstream ifs(file_path);
+ if (!ifs.is_open()) {
+ throw std::invalid_argument("File does not exist");
+ }
+ std::vector<std::string> lines;
+ std::string line;
+ while (std::getline(ifs, line)) {
+ lines.emplace_back(line);
+ }
+ std::sort(lines.begin(), lines.end());
+ return lines;
+ }
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+ LevenshteinBenchmarkTest,
+ Values(BenchmarkType::DfaExplicit,
+ BenchmarkType::DfaImplicit,
+ BenchmarkType::Legacy),
+ LevenshteinBenchmarkTest::stringify_params);
+
+// ("abc", 1) => "a"
+// ("abc", 3) => "abc"
+// ("abc", 7) => "abcabca"
+// ... and so on.
+std::string repeated_string(std::string_view str, uint32_t sz) {
+ uint32_t chunks = sz / str.size();
+ std::string ret;
+ ret.reserve(sz);
+ for (uint32_t i = 0; i < chunks; ++i) {
+ ret += str;
+ }
+ uint32_t rem = sz % str.size();
+ ret += str.substr(0, rem);
+ return ret;
+}
+
+TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_time) {
+ using vespalib::BenchmarkTimer;
+ const auto type = benchmark_type();
+ fprintf(stderr, "------ %s ------\n", to_s(type));
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : string_lengths()) {
+ // Use same string as both source and target. This is the worst case in that the entire
+ // string must be matched and any sparse representation is always maximally filled since
+ // we never expend any edits via mismatches.
+ // Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.).
+ std::string str = repeated_string("abcde", sz);
+ double min_time_s;
+ if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
+ auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
+ : LevenshteinDfa::DfaType::Implicit;
+ auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ auto res = dfa.match(str, nullptr); // not benchmarking successor generation
+ do_not_optimize_away(res);
+ }, 1.0);
+ } else {
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ auto str_u32 = utf8_string_to_utf32(str); // Must be done per term, so included in benchmark body
+ auto res = vespalib::LevenshteinDistance::calculate(str_u32, str_u32, k);
+ do_not_optimize_away(res);
+ }, 1.0);
+ }
+ fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0);
+ }
+ }
+}
+
+TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) {
+ if (!benchmarking_enabled()) {
+ GTEST_SKIP() << "benchmarking not enabled";
+ }
+ using vespalib::BenchmarkTimer;
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : string_lengths()) {
+ std::string str = repeated_string("abcde", sz);
+ double min_time_s = BenchmarkTimer::benchmark([&] {
+ auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ do_not_optimize_away(dfa);
+ }, 2.0);
+ auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ size_t mem_usage = dfa.memory_usage();
+ fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage);
+ }
+ }
+}
+
+TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
+ using vespalib::BenchmarkTimer;
+ const auto type = benchmark_type();
+ const auto dict = load_dictionary_once();
+ std::vector target_lengths = {1, 2, 4, 8, 12, 16, 24, 32, 64};
+ fprintf(stderr, "------ %s ------\n", to_s(type));
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : target_lengths) {
+ std::string str = repeated_string("abcde", sz);
+ double min_time_s;
+ if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
+ auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
+ : LevenshteinDfa::DfaType::Implicit;
+ auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ for (const auto& line : dict) {
+ auto res = dfa.match(line, nullptr);
+ do_not_optimize_away(res);
+ }
+ }, 2.0);
+ } else {
+ min_time_s = BenchmarkTimer::benchmark([&] {
+ auto target_u32 = utf8_string_to_utf32(str);
+ for (const auto& line : dict) {
+ auto line_u32 = utf8_string_to_utf32(line);
+ auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k);
+ do_not_optimize_away(res);
+ }
+ }, 2.0);
+ }
+ fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0);
+ }
+ }
+}
+
+TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
+ const auto type = benchmark_type();
+ if (type == BenchmarkType::Legacy) {
+ GTEST_SKIP() << "Skipping not supported for legacy implementation";
+ }
+ using vespalib::BenchmarkTimer;
+ const auto dict = load_dictionary_once();
+ std::vector target_lengths = {1, 2, 4, 8, 12, 16, 24, 32, 64};
+ fprintf(stderr, "------ %s ------\n", to_s(type));
+ for (uint8_t k : {1, 2}) {
+ for (uint32_t sz : target_lengths) {
+ std::string str = repeated_string("abcde", sz);
+ auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
+ : LevenshteinDfa::DfaType::Implicit;
+ auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ double min_time_s = BenchmarkTimer::benchmark([&] {
+ auto iter = dict.cbegin();
+ auto end = dict.cend();
+ std::string successor;
+ while (iter != end) {
+ auto maybe_match = dfa.match(*iter, &successor);
+ if (maybe_match.matches()) {
+ ++iter;
+ } else {
+ iter = std::lower_bound(iter, end, successor);
+ }
+ }
+ }, 2.0);
+ fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0);
+ }
+ }
+}
+
+// TODO:
+// - explicit successor generation benchmark
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ if (argc > 1) {
+ benchmark_dictionary = argv[1];
+ if (!fs::exists(fs::path(benchmark_dictionary))) {
+ fprintf(stderr, "Benchmark dictionary file '%s' does not exist\n", benchmark_dictionary.c_str());
+ return 1;
+ }
+ }
+ return RUN_ALL_TESTS();
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
index 1d770163e06..bdbb03bcfee 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -1,8 +1,12 @@
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(vespalib_vespalib_fuzzy OBJECT
- SOURCES
+ SOURCES
+ explicit_levenshtein_dfa.cpp
fuzzy_matcher.cpp
+ implicit_levenshtein_dfa.cpp
+ levenshtein_dfa.cpp
levenshtein_distance.cpp
- DEPENDS
- )
+ unicode_utils.cpp
+ DEPENDS
+)
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
new file mode 100644
index 00000000000..c445c60cc01
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -0,0 +1,70 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <concepts>
+#include <cstdint>
+
+namespace vespalib::fuzzy {
+
+// Concept that all DFA matcher implementations must satisfy
+template <typename T>
+concept DfaMatcher = requires(T a) {
+ typename T::StateType;
+ typename T::StateParamType;
+ typename T::EdgeType;
+
+ // Initial (starting) state of the DFA
+ { a.start() } -> std::same_as<typename T::StateType>;
+
+ // Whether a given state constitutes a string match within the maximum number of edits
+ { a.is_match(typename T::StateType{}) } -> std::same_as<bool>;
+
+ // Whether a given state _may_ result in a match, either in the given state or in the
+ // future if the remaining string input is within the max edit distance
+ { a.can_match(typename T::StateType{}) } -> std::same_as<bool>;
+
+ // Whether the given state is a valid state. Used for invariant checking.
+ { a.valid_state(typename T::StateType{}) } -> std::same_as<bool>;
+
+ // Iff the given state represents a terminal matching state, returns the number of
+ // edits required to reach the state. Otherwise, returns max edits + 1.
+ { a.match_edit_distance(typename T::StateType{}) } -> std::same_as<uint8_t>;
+
+ // Returns the state that is the result of matching the single logical Levenshtein
+ // matrix row represented by the given state with the input u32 character value.
+ { a.match_input(typename T::StateType{}, uint32_t{}) } -> std::same_as<typename T::StateType>;
+
+ // Returns the state that is the result of matching the single logical Levenshtein
+ // matrix row represented by the given state with a sentinel character that cannot
+ // match any character in the target string (i.e. is always a mismatch).
+ { a.match_wildcard(typename T::StateType{}) } -> std::same_as<typename T::StateType>;
+
+ // Whether there is exists an out edge from the given state that can accept a
+ // _higher_ UTF-32 code point value (character) than the input u32 value. Such an
+ // edge _may_ be a wildcard edge, which accepts any character.
+ { a.has_higher_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>;
+
+ // Whether there exists an out edge from the given state whose u32 character value
+ // _exactly_ matches the input u32 value.
+ { a.has_exact_explicit_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>;
+
+ // Returns the out edge `e` from the given state that satisfies _both_:
+ // 1. higher than the given u32 value
+ // 2. no other out edges are lower than `e`
+ // Only called in a context where the caller already knows that such an edge must exist.
+ { a.lowest_higher_explicit_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<typename T::EdgeType>;
+
+ // Returns the out edge from the given state that has the lowest character value
+ { a.smallest_explicit_out_edge(typename T::StateType{}) } -> std::same_as<typename T::EdgeType>;
+
+ // Whether the given edge is a valid edge. Used for invariant checking.
+ { a.valid_edge(typename T::EdgeType{}) } -> std::same_as<bool>;
+
+ // For a given edge, returns the UTF-32 code point value the edge represents
+ { a.edge_to_u32char(typename T::EdgeType{}) } -> std::same_as<uint32_t>;
+
+ // Returns the state that is the result of following the given edge from the given state.
+ { a.edge_to_state(typename T::StateType{}, typename T::EdgeType{}) } -> std::same_as<typename T::StateType>;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h
new file mode 100644
index 00000000000..7e7881c5a14
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h
@@ -0,0 +1,299 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "sparse_state.h"
+#include <span>
+
+namespace vespalib::fuzzy {
+
+template <typename Traits>
+struct DfaSteppingBase {
+ using StateType = Traits::StateType;
+ using TransitionsType = Traits::TransitionsType;
+
+ std::span<const uint32_t> _u32_str; // TODO std::u32string_view
+
+ DfaSteppingBase(std::span<const uint32_t> str) noexcept
+ : _u32_str(str)
+ {
+ }
+
+ [[nodiscard]] static constexpr uint8_t max_edits() noexcept {
+ return Traits::max_edits();
+ }
+
+ /**
+ * Returns the initial state of the DFA. This represents the first row in the
+ * Levenshtein matrix.
+ */
+ [[nodiscard]] StateType start() const {
+ StateType ret;
+ const auto j = std::min(static_cast<uint32_t>(max_edits()),
+ static_cast<uint32_t>(_u32_str.size())); // e.g. the empty string as target
+ for (uint32_t i = 0; i <= j; ++i) {
+ ret.append(i, i);
+ }
+ return ret;
+ }
+
+ /**
+ * DFA stepping function that takes an input (sparse) state and a 32-bit character value
+ * (does not have to be valid UTF-32, but usually is) and generates a resulting state
+ * that represents applying the Levenshtein algorithm on a particular matrix row using
+ * the provided source string character.
+ *
+ * The returned state only includes elements where the edit distance (cost) is within
+ * the maximum number of edits. All other elements are implicitly beyond the max
+ * edit distance. It doesn't matter _how_ far beyond they are, since we have a fixed
+ * maximum to consider.
+ *
+ * Stepping a non-matching state S (can_match(S) == false) results in another non-
+ * matching state.
+ *
+ * As an example, this is a visualization of stepping through all source characters of
+ * the string "fxod" when matching the target string "food" with max edits k=1.
+ * Note: the actual internal representation is logical <column#, cost> tuples, but
+ * rendering as a matrix makes things easier to understand. Elements _not_ part of the
+ * state are rendered as '-'.
+ *
+ * f o o d
+ * start(): [0 1 - - -]
+ * 'f': [1 0 1 - -]
+ * 'x': [- 1 1 - -]
+ * 'o': [- - 1 1 -]
+ * 'd': [- - - - 1]
+ *
+ * In this case, the resulting edit distance is 1, with one substitution 'x' -> 'o'.
+ *
+ * If we pull out our trusty pen & paper and do the full matrix calculations, we see
+ * that the above is equivalent to the full matrix with all costs > k pruned away:
+ *
+ * f o o d
+ * [0 1 2 3 4]
+ * f [1 0 1 2 3]
+ * x [2 1 1 2 3]
+ * o [3 2 1 1 2]
+ * d [4 3 2 2 1]
+ *
+ * Since we're working on sparse states, stepping requires a bit of manual edge case
+ * handling when compared to a dense representation.
+ *
+ * We first have to handle the case where our state includes the 0th matrix column.
+ * In an explicit Levenshtein matrix of target string length n, source string length m,
+ * the first column is always the values [0, m], increasing with 1 per row (the first
+ * _row_ is handled by start()).
+ *
+ * To mirror this, if our sparse state includes column 0 we have to increment it by 1,
+ * unless doing so would bring the cost beyond our max number of edits, in which case
+ * we don't bother including the column in the new state at all. These correspond to
+ * the start() -> 'f' -> 'x' transitions in the example above.
+ *
+ * What remains is then to do the actual Levenshtein insert/delete/substitute formula
+ * for matching positions in the matrix. Let d represent the logical (full) Levenshtein
+ * distance matrix and cell d[i, j] be the minimum number of edits between source string
+ * character at i+1 and target string character at j+1:
+ *
+ * Insertion cost: d[i, j-1] + 1
+ * Deletion cost: d[i-1, j] + 1
+ * Substitution cost: d[i-1, j-1] + (s[i-1] == t[j-1] ? 1 :0)
+ *
+ * d[i, j] = min(Insertion cost, Deletion cost, Substitution cost)
+ *
+ * We have to turn this slightly on the head, as instead of going through a matrix row
+ * and "pulling" values from the previous row, we have to go through a state representing
+ * the previous row and "push" new values instead (iff these values are within max edits).
+ * This also means we compute costs for indexes offset by 1 from the source state index
+ * (can be visualized as the element one down diagonally to the right).
+ *
+ * Insertion considers the current row only, i.e. the state being generated. We always
+ * work left to right in column order, so we can check if the last element (if any)
+ * in our _new_ sparse state is equal to the index of our source state element. If not,
+ * we know that it was beyond max edits. Max edits + 1 is inherently beyond max edits
+ * and need not be included.
+ *
+ * Deletion considers the cell directly above our own, which is part of the input state
+ * if it exists. Since we're computing the costs of cells at index + 1, we know that the
+ * only way for this cell to be present in the state is if the _next_ element of our
+ * input state exists and has an index equal to index + 1. If so, the deletion cost is
+ * the cost recorded for this element + 1.
+ *
+ * Substitution considers the cell diagonally up to the left. This very conveniently
+ * happens to be the input state cell we're currently working from, so it's therefore
+ * always present.
+ *
+ * Example stepping with c='x', max edits k=1:
+ *
+ * ====== Initially ======
+ *
+ * f o o d
+ * state_in: [1 0 1 - -] (0:1, 1:0, 2:1)
+ * out: [] ()
+ *
+ * We have a 0th column in state_in, but incrementing it results in 2 > k, so not
+ * appended to out.
+ *
+ * ====== State (0:1), computing for index 1 ======
+ *
+ * Insertion: out state is empty (no cell to our left), so implicit insertion cost
+ * is > k
+ * Deletion: state_in[1] is (1:0), which means it represents the cell just above
+ * index 1. Deletion cost is therefore 0+1 = 1
+ * Substitution: (t[0] = 'f') != (c = 'x'), so substitution cost is 1+1 = 2
+ *
+ * Min cost is 1, which is <= k. Appending to output.
+ *
+ * out: [- 1] (1:1)
+ *
+ * ====== State (1:0), computing for index 2 ======
+ *
+ * Insertion: last element in out has index 1 (cell to our immediate left) with cost
+ * 1, so insertion cost is 1+1 = 2
+ * Deletion: state_in[2] is (2:1), which means it represents the cell just above
+ * index 2. Deletion cost is therefore 1+1 = 2
+ * Substitution: (t[1] = 'o') != (c = 'x'), so substitution cost is 0+1 = 1
+ *
+ * Min cost is 1, which is <= k. Appending to output.
+ *
+ * out: [- 1 1] (1:1, 2:1)
+ *
+ * ====== State (2:1), computing for index 3 ======
+ *
+ * Insertion: last element in out has index 2 (cell to our immediate left) with cost
+ * 1, so insertion cost is 1+1 = 2
+ * Deletion: state_in[3] does not exist, so implicit deletion cost is > k
+ * Substitution: (t[2] = 'o') != (c = 'x'), so substitution cost is 1+1 = 2
+ *
+ * Min cost is 2, which is > k. Not appending to output.
+ *
+ * Resulting output state (right-padded for clarity):
+ *
+ * [- 1 1 - -] (1:1, 2:1)
+ *
+ */
+ [[nodiscard]] StateType step(const StateType& state_in, uint32_t c) const {
+ if (state_in.empty()) {
+ return state_in; // A non-matching state can only step to another equally non-matching state
+ }
+ StateType new_state;
+ if ((state_in.index(0) == 0) && (state_in.cost(0) < max_edits())) {
+ new_state.append(0, state_in.cost(0) + 1);
+ }
+ for (uint32_t i = 0; i < state_in.size(); ++i) {
+ const auto idx = state_in.index(i);
+ if (idx == _u32_str.size()) [[unlikely]] {
+ break; // Can't process beyond matrix width
+ }
+ const uint8_t sub_cost = (_u32_str[idx] == c) ? 0 : 1;
+ // For our Levenshtein insert/delete/sub ops, we know that if a particular index is _not_
+ // in the sparse state, its implicit distance is beyond the max edits, and need not be
+ // considered.
+ auto dist = state_in.cost(i) + sub_cost; // (Substitution)
+ if (!new_state.empty() && (new_state.last_index() == idx)) { // (Insertion) anything to our immediate left?
+ dist = std::min(dist, new_state.last_cost() + 1);
+ }
+ if ((i < state_in.size() - 1) && (state_in.index(i + 1) == idx + 1)) { // (Deletion) anything immediately above?
+ dist = std::min(dist, state_in.cost(i + 1) + 1);
+ }
+ if (dist <= max_edits()) {
+ new_state.append(idx + 1, dist);
+ }
+ }
+ return new_state;
+ }
+
+ /**
+ * Simplified version of step() which does not assemble a new state, but only checks
+ * whether _any_ mismatching character can be substituted in and still result in a
+ * potentially matching state. This is the case if the resulting state would contain
+ * _at least one_ entry (recalling that we only retain entries that are within the
+ * max number of edits).
+ *
+ * Consider using this directly instead of `can_match(step(state, WILDCARD))`,
+ * which has the exact same semantics, but requires computing the full (sparse)
+ * state before checking if it has any element at all. can_wildcard_step() just
+ * jumps straight to the last part.
+ */
+ [[nodiscard]] bool can_wildcard_step(const StateType& state_in) const noexcept {
+ if (state_in.empty()) {
+ return false; // by definition
+ }
+ if ((state_in.index(0) == 0) && (state_in.cost(0) < max_edits())) {
+ return true;
+ }
+ for (uint32_t i = 0; i < state_in.size(); ++i) {
+ const auto idx = state_in.index(i);
+ if (idx == _u32_str.size()) [[unlikely]] {
+ break;
+ }
+ const uint8_t sub_cost = 1; // by definition
+ auto dist = state_in.cost(i) + sub_cost;
+ // Insertion only looks at the entries already computed in the current row
+ // and always increases the cost by 1. Since we always bail out immediately if
+ // there would have been at least one entry within max edits, we transitively
+ // know that since we have not bailed out yet there is no way we can get here
+ // and have insertion actually yield a match. So skip computing it entirely.
+ if ((i < state_in.size() - 1) && (state_in.index(i + 1) == idx + 1)) {
+ dist = std::min(dist, state_in.cost(i + 1) + 1);
+ }
+ if (dist <= max_edits()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Checks if the given state represents a terminal state within the max number of edits
+ */
+ [[nodiscard]] bool is_match(const StateType& state) const noexcept {
+ // If the last index is equal to the string's length, it means we were able to match
+ // the entire string and still be within the max edit distance.
+ return (!state.empty() && state.last_index() == static_cast<uint32_t>(_u32_str.size()));
+ }
+
+ /**
+ * Iff the input state represents a terminal matching state, returns the number of
+ * edits required to reach the state. Otherwise, returns max edits + 1.
+ */
+ [[nodiscard]] uint8_t match_edit_distance(const StateType& state) const noexcept {
+ if (!is_match(state)) {
+ return max_edits() + 1;
+ }
+ return state.last_cost();
+ }
+
+ /**
+ * Returns whether the given state _may_ end up matching the target string,
+ * depending on the remaining source string characters.
+ *
+ * Note: is_match(s) => can_match(s) is true, but
+ * can_match(s) => is_match(s) is false
+ */
+ [[nodiscard]] bool can_match(const StateType& state) const noexcept {
+ // The presence of any entries at all indicates that we may still potentially match
+ // the target string if the remaining input is within the maximum number of edits.
+ return !state.empty();
+ }
+
+ /**
+ * All valid character transitions from this state are those that are reachable
+ * within the max edit distance.
+ */
+ TransitionsType transitions(const StateType& state) const {
+ TransitionsType t;
+ for (size_t i = 0; i < state.size(); ++i) {
+ const auto idx = state.index(i);
+ if (idx < _u32_str.size()) [[likely]] {
+ t.add_char(_u32_str[idx]);
+ }
+ }
+ // We must ensure transitions are in increasing character order, so that the
+ // lowest possible higher char than any candidate char can be found with a
+ // simple "first-fit" linear scan.
+ t.sort();
+ return t;
+ }
+
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg b/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg
new file mode 100644
index 00000000000..0974e1d161f
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg
@@ -0,0 +1,286 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><!-- Generated by graphviz version 2.40.1 (20161225.0304)
+ --><!-- Title: levenshtein_dfa Pages: 1 --><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="333pt" height="488pt" viewBox="0.00 0.00 333.00 488.00">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 484)">
+<title>levenshtein_dfa</title>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-484 329,-484 329,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="#000000" cx="211" cy="-462" rx="18" ry="18"/>
+<text text-anchor="middle" x="211" y="-457.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">0</text>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="#000000" cx="157" cy="-373.2" rx="18" ry="18"/>
+<text text-anchor="middle" x="157" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">1</text>
+</g>
+<!-- 0&#45;&gt;1 -->
+<g id="edge1" class="edge">
+<title>0-&gt;1</title>
+<path fill="none" stroke="#000000" d="M201.5939,-446.5322C193.3592,-432.9906 181.2571,-413.0894 171.7374,-397.4348"/>
+<polygon fill="#000000" stroke="#000000" points="174.6658,-395.5142 166.4795,-388.7885 168.6849,-399.1513 174.6658,-395.5142"/>
+<text text-anchor="middle" x="189.9453" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="#000000" cx="211" cy="-373.2" rx="18" ry="18"/>
+<text text-anchor="middle" x="211" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">2</text>
+</g>
+<!-- 0&#45;&gt;2 -->
+<g id="edge2" class="edge">
+<title>0-&gt;2</title>
+<path fill="none" stroke="#000000" d="M211,-443.6006C211,-431.4949 211,-415.4076 211,-401.6674"/>
+<polygon fill="#000000" stroke="#000000" points="214.5001,-401.272 211,-391.272 207.5001,-401.2721 214.5001,-401.272"/>
+<text text-anchor="middle" x="214.8913" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="#000000" cx="265" cy="-373.2" rx="18" ry="18"/>
+<text text-anchor="middle" x="265" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">3</text>
+</g>
+<!-- 0&#45;&gt;3 -->
+<g id="edge3" class="edge">
+<title>0-&gt;3</title>
+<path fill="none" stroke="#000000" d="M220.4061,-446.5322C228.6408,-432.9906 240.7429,-413.0894 250.2626,-397.4348"/>
+<polygon fill="#000000" stroke="#000000" points="253.3151,-399.1513 255.5205,-388.7885 247.3342,-395.5142 253.3151,-399.1513"/>
+<text text-anchor="middle" x="244.7223" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="#000000" cx="157" cy="-284.4" rx="18" ry="18"/>
+<text text-anchor="middle" x="157" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">4</text>
+</g>
+<!-- 1&#45;&gt;4 -->
+<g id="edge4" class="edge">
+<title>1-&gt;4</title>
+<path fill="none" stroke="#000000" d="M157,-354.8006C157,-342.6949 157,-326.6076 157,-312.8674"/>
+<polygon fill="#000000" stroke="#000000" points="160.5001,-312.472 157,-302.472 153.5001,-312.4721 160.5001,-312.472"/>
+<text text-anchor="middle" x="158.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 1&#45;&gt;4 -->
+<g id="edge6" class="edge">
+<title>1-&gt;4</title>
+<path fill="none" stroke="#000000" d="M156.1231,-354.8902C155.8898,-349.221 155.6708,-342.9535 155.5554,-337.2 155.4057,-329.7348 155.4057,-327.8652 155.5554,-320.4 155.6041,-317.9728 155.6712,-315.454 155.7501,-312.9273"/>
+<polygon fill="#000000" stroke="#000000" points="159.2558,-312.8309 156.1231,-302.7098 152.2605,-312.5755 159.2558,-312.8309"/>
+<text text-anchor="middle" x="158.7223" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="#000000" cx="103" cy="-284.4" rx="18" ry="18"/>
+<text text-anchor="middle" x="103" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">5</text>
+</g>
+<!-- 1&#45;&gt;5 -->
+<g id="edge5" class="edge">
+<title>1-&gt;5</title>
+<path fill="none" stroke="#000000" d="M147.5939,-357.7322C139.3592,-344.1906 127.2571,-324.2894 117.7374,-308.6348"/>
+<polygon fill="#000000" stroke="#000000" points="120.6658,-306.7142 112.4795,-299.9885 114.6849,-310.3513 120.6658,-306.7142"/>
+<text text-anchor="middle" x="138.8913" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="#000000" cx="307" cy="-284.4" rx="18" ry="18"/>
+<text text-anchor="middle" x="307" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">6</text>
+</g>
+<!-- 2&#45;&gt;6 -->
+<g id="edge7" class="edge">
+<title>2-&gt;6</title>
+<path fill="none" stroke="#000000" d="M224.3484,-360.8527C240.3968,-346.0079 267.511,-320.9274 286.2858,-303.5607"/>
+<polygon fill="#000000" stroke="#000000" points="288.8006,-306.0022 293.765,-296.6424 284.0473,-300.8635 288.8006,-306.0022"/>
+<text text-anchor="middle" x="268.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="#000000" cx="190" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="190" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">7</text>
+</g>
+<!-- 2&#45;&gt;7 -->
+<g id="edge8" class="edge">
+<title>2-&gt;7</title>
+<path fill="none" stroke="#000000" d="M208.8708,-355.1934C205.2075,-324.212 197.6865,-260.606 193.3267,-223.7346"/>
+<polygon fill="#000000" stroke="#000000" points="196.7849,-223.1741 192.1348,-213.6543 189.8333,-223.9961 196.7849,-223.1741"/>
+<text text-anchor="middle" x="205.8913" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 3&#45;&gt;6 -->
+<g id="edge9" class="edge">
+<title>3-&gt;6</title>
+<path fill="none" stroke="#000000" d="M272.7034,-356.9127C278.881,-343.8515 287.6665,-325.2765 294.7989,-310.1966"/>
+<polygon fill="#000000" stroke="#000000" points="298.0914,-311.4211 299.2032,-300.8848 291.7635,-308.4281 298.0914,-311.4211"/>
+<text text-anchor="middle" x="290.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text>
+</g>
+<!-- 8 -->
+<g id="node9" class="node">
+<title>8</title>
+<ellipse fill="none" stroke="#000000" cx="263" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="263" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">8</text>
+</g>
+<!-- 3&#45;&gt;8 -->
+<g id="edge10" class="edge">
+<title>3-&gt;8</title>
+<path fill="none" stroke="#000000" d="M264.7972,-355.1934C264.4483,-324.212 263.732,-260.606 263.3168,-223.7346"/>
+<polygon fill="#000000" stroke="#000000" points="266.8158,-223.6142 263.2033,-213.6543 259.8162,-223.6931 266.8158,-223.6142"/>
+<text text-anchor="middle" x="267.8913" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 4&#45;&gt;7 -->
+<g id="edge11" class="edge">
+<title>4-&gt;7</title>
+<path fill="none" stroke="#000000" d="M163.3627,-267.2785C168.1106,-254.5023 174.6869,-236.8062 180.1171,-222.194"/>
+<polygon fill="#000000" stroke="#000000" points="183.4541,-223.2619 183.6568,-212.669 176.8925,-220.8234 183.4541,-223.2619"/>
+<text text-anchor="middle" x="180.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 5&#45;&gt;7 -->
+<g id="edge14" class="edge">
+<title>5-&gt;7</title>
+<path fill="none" stroke="#000000" d="M115.8371,-271.2973C130.1833,-256.6543 153.5826,-232.7709 170.2601,-215.7483"/>
+<polygon fill="#000000" stroke="#000000" points="172.8948,-218.0603 177.393,-208.4678 167.8946,-213.1615 172.8948,-218.0603"/>
+<text text-anchor="middle" x="157.7223" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 9 -->
+<g id="node10" class="node">
+<title>9</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="60" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="60" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">9(1)</text>
+</g>
+<!-- 5&#45;&gt;9 -->
+<g id="edge12" class="edge">
+<title>5-&gt;9</title>
+<path fill="none" stroke="#000000" d="M95.1131,-268.1127C88.7885,-255.0515 79.7938,-236.4765 72.4916,-221.3966"/>
+<polygon fill="#000000" stroke="#000000" points="75.4909,-219.5597 67.9825,-212.0848 69.1907,-222.6105 75.4909,-219.5597"/>
+<text text-anchor="middle" x="89.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 10 -->
+<g id="node11" class="node">
+<title>10</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="126" cy="-195.6" rx="18" ry="18"/>
+<text text-anchor="middle" x="126" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">10(1)</text>
+</g>
+<!-- 5&#45;&gt;10 -->
+<g id="edge13" class="edge">
+<title>5-&gt;10</title>
+<path fill="none" stroke="#000000" d="M107.5441,-266.856C110.7887,-254.3287 115.2168,-237.2326 118.9222,-222.9264"/>
+<polygon fill="#000000" stroke="#000000" points="122.3462,-223.6657 121.4654,-213.1076 115.5698,-221.9105 122.3462,-223.6657"/>
+<text text-anchor="middle" x="120.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 6&#45;&gt;8 -->
+<g id="edge15" class="edge">
+<title>6-&gt;8</title>
+<path fill="none" stroke="#000000" d="M298.9297,-268.1127C292.4172,-254.9694 283.1382,-236.2426 275.6413,-221.1125"/>
+<polygon fill="#000000" stroke="#000000" points="278.5951,-219.1904 271.0191,-211.784 272.3229,-222.2984 278.5951,-219.1904"/>
+<text text-anchor="middle" x="291.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 11 -->
+<g id="node12" class="node">
+<title>11</title>
+<ellipse fill="none" stroke="#000000" cx="170" cy="-106.8" rx="18" ry="18"/>
+<text text-anchor="middle" x="170" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">11</text>
+</g>
+<!-- 7&#45;&gt;11 -->
+<g id="edge16" class="edge">
+<title>7-&gt;11</title>
+<path fill="none" stroke="#000000" d="M185.9527,-177.63C183.1694,-165.2722 179.4189,-148.6197 176.2537,-134.5662"/>
+<polygon fill="#000000" stroke="#000000" points="179.5848,-133.4268 173.973,-124.4402 172.7558,-134.9649 179.5848,-133.4268"/>
+<text text-anchor="middle" x="184.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 12 -->
+<g id="node13" class="node">
+<title>12</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="113" cy="-18" rx="18" ry="18"/>
+<text text-anchor="middle" x="113" y="-13.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">12(1)</text>
+</g>
+<!-- 7&#45;&gt;12 -->
+<g id="edge17" class="edge">
+<title>7-&gt;12</title>
+<path fill="none" stroke="#000000" d="M195.7305,-178.1814C201.8942,-156.377 209.2729,-118.3097 197,-88.8 185.9488,-62.2279 158.7426,-42.3831 138.2647,-30.5667"/>
+<polygon fill="#000000" stroke="#000000" points="139.8619,-27.4509 129.4116,-25.7072 136.4936,-33.5872 139.8619,-27.4509"/>
+<text text-anchor="middle" x="206.8913" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 8&#45;&gt;11 -->
+<g id="edge18" class="edge">
+<title>8-&gt;11</title>
+<path fill="none" stroke="#000000" d="M249.6754,-182.8771C234.3441,-168.2382 208.9734,-144.0133 190.9836,-126.8359"/>
+<polygon fill="#000000" stroke="#000000" points="192.9495,-123.8738 183.2999,-119.4993 188.1154,-128.9366 192.9495,-123.8738"/>
+<text text-anchor="middle" x="227.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 13 -->
+<g id="node14" class="node">
+<title>13</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="18" cy="-106.8" rx="18" ry="18"/>
+<text text-anchor="middle" x="18" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">13(1)</text>
+</g>
+<!-- 9&#45;&gt;13 -->
+<g id="edge19" class="edge">
+<title>9-&gt;13</title>
+<path fill="none" stroke="#000000" d="M44.2356,-186.804C34.7839,-180.5699 23.5878,-171.2526 18.2174,-159.6 14.7103,-151.9903 13.6799,-143.0834 13.8048,-134.7757"/>
+<polygon fill="#000000" stroke="#000000" points="17.3023,-134.9283 14.4812,-124.716 10.3181,-134.4586 17.3023,-134.9283"/>
+<text text-anchor="middle" x="22.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 9&#45;&gt;13 -->
+<g id="edge21" class="edge">
+<title>9-&gt;13</title>
+<path fill="none" stroke="#000000" d="M52.2966,-179.3127C46.119,-166.2515 37.3335,-147.6765 30.2011,-132.5966"/>
+<polygon fill="#000000" stroke="#000000" points="33.2365,-130.8281 25.7968,-123.2848 26.9086,-133.8211 33.2365,-130.8281"/>
+<text text-anchor="middle" x="45.7223" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+<!-- 14 -->
+<g id="node15" class="node">
+<title>14</title>
+<ellipse fill="#d3d3d3" stroke="#000000" cx="72" cy="-106.8" rx="18" ry="18"/>
+<text text-anchor="middle" x="72" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">14(0)</text>
+</g>
+<!-- 9&#45;&gt;14 -->
+<g id="edge20" class="edge">
+<title>9-&gt;14</title>
+<path fill="none" stroke="#000000" d="M62.4284,-177.63C64.0874,-165.353 66.3193,-148.8372 68.2105,-134.8421"/>
+<polygon fill="#000000" stroke="#000000" points="71.7044,-135.1219 69.5752,-124.7433 64.7675,-134.1845 71.7044,-135.1219"/>
+<text text-anchor="middle" x="71.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 10&#45;&gt;11 -->
+<g id="edge22" class="edge">
+<title>10-&gt;11</title>
+<path fill="none" stroke="#000000" d="M134.0703,-179.3127C140.5828,-166.1694 149.8618,-147.4426 157.3587,-132.3125"/>
+<polygon fill="#000000" stroke="#000000" points="160.6771,-133.4984 161.9809,-122.984 154.4049,-130.3904 160.6771,-133.4984"/>
+<text text-anchor="middle" x="155.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text>
+</g>
+<!-- 10&#45;&gt;12 -->
+<g id="edge23" class="edge">
+<title>10-&gt;12</title>
+<path fill="none" stroke="#000000" d="M124.6819,-177.5934C122.4142,-146.612 117.7583,-83.006 115.0594,-46.1346"/>
+<polygon fill="#000000" stroke="#000000" points="118.5423,-45.772 114.3215,-36.0543 111.561,-46.2831 118.5423,-45.772"/>
+<text text-anchor="middle" x="124.8913" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 11&#45;&gt;12 -->
+<g id="edge24" class="edge">
+<title>11-&gt;12</title>
+<path fill="none" stroke="#000000" d="M161.5849,-90.7772C155.8346,-80.1448 147.859,-65.9928 140,-54 137.0775,-49.5402 133.7942,-44.8997 130.5502,-40.4917"/>
+<polygon fill="#000000" stroke="#000000" points="133.2767,-38.294 124.4657,-32.4105 127.6846,-42.5045 133.2767,-38.294"/>
+<text text-anchor="middle" x="153.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 13&#45;&gt;12 -->
+<g id="edge25" class="edge">
+<title>13-&gt;12</title>
+<path fill="none" stroke="#000000" d="M29.0516,-92.0851C37.7254,-81.0086 50.4346,-65.7741 63.2174,-54 71.136,-46.7062 80.5495,-39.5675 89.0481,-33.5954"/>
+<polygon fill="#000000" stroke="#000000" points="91.192,-36.3695 97.4722,-27.8367 87.2416,-30.5907 91.192,-36.3695"/>
+<text text-anchor="middle" x="67.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 14&#45;&gt;12 -->
+<g id="edge26" class="edge">
+<title>14-&gt;12</title>
+<path fill="none" stroke="#000000" d="M79.7118,-90.0974C85.7248,-77.0741 94.1818,-58.7574 101.0669,-43.8455"/>
+<polygon fill="#000000" stroke="#000000" points="104.3085,-45.1739 105.3228,-34.6277 97.9532,-42.2395 104.3085,-45.1739"/>
+<text text-anchor="middle" x="100.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text>
+</g>
+<!-- 14&#45;&gt;12 -->
+<g id="edge27" class="edge">
+<title>14-&gt;12</title>
+<path fill="none" stroke="#000000" d="M71.9481,-88.4225C72.5604,-77.8966 74.4493,-64.6966 79.5554,-54 82.4978,-47.8362 86.8709,-42.0067 91.4921,-36.9052"/>
+<polygon fill="#000000" stroke="#000000" points="94.2341,-39.1107 98.7881,-29.5445 89.2625,-34.1828 94.2341,-39.1107"/>
+<text text-anchor="middle" x="82.7223" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text>
+</g>
+</g>
+</svg> \ No newline at end of file
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp
new file mode 100644
index 00000000000..f78de5cc082
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp
@@ -0,0 +1,11 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "explicit_levenshtein_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class ExplicitLevenshteinDfaImpl<1>;
+template class ExplicitLevenshteinDfaImpl<2>;
+template class ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>;
+template class ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>;
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
new file mode 100644
index 00000000000..49baad21530
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -0,0 +1,147 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "dfa_stepping_base.h"
+#include "levenshtein_dfa.h"
+#include "sparse_state.h"
+#include "unicode_utils.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+// A doomed state is one that cannot possibly match the target string
+constexpr const uint32_t DOOMED = UINT32_MAX;
+
+template <uint8_t MaxEdits>
+struct DfaNode {
+ static constexpr uint8_t MaxCharOutEdges = diag(MaxEdits); // Not counting wildcard edge
+
+ struct Edge {
+ uint32_t u32ch;
+ uint32_t node;
+ };
+
+ std::array<Edge, MaxCharOutEdges> match_out_edges_buf;
+ uint32_t wildcard_edge_to = DOOMED;
+ uint8_t num_match_out_edges = 0;
+ uint8_t edits = UINT8_MAX;
+
+ [[nodiscard]] bool has_wildcard_edge() const noexcept {
+ return wildcard_edge_to != DOOMED;
+ }
+
+ [[nodiscard]] uint32_t wildcard_edge_to_or_doomed() const noexcept {
+ return wildcard_edge_to;
+ }
+
+ [[nodiscard]] std::span<const Edge> match_out_edges() const noexcept {
+ return std::span(match_out_edges_buf.begin(), num_match_out_edges);
+ }
+
+ [[nodiscard]] uint32_t match_or_doomed(uint32_t ch) const noexcept {
+ // Always prefer the exact matching edges
+ for (const auto& e : match_out_edges()) {
+ if (e.u32ch == ch) {
+ return e.node;
+ }
+ }
+ // Fallback to wildcard edge if possible (could be doomed)
+ return wildcard_edge_to;
+ }
+
+ [[nodiscard]] bool has_exact_match(uint32_t ch) const noexcept {
+ for (const auto& e : match_out_edges()) {
+ if (e.u32ch == ch) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ [[nodiscard]] size_t has_higher_out_edge(uint32_t ch) const noexcept {
+ if (has_wildcard_edge()) {
+ return true; // implicitly possible to substitute a higher out edge char
+ }
+ return lowest_higher_explicit_out_edge(ch) != nullptr;
+ }
+
+ [[nodiscard]] const Edge* lowest_higher_explicit_out_edge(uint32_t ch) const noexcept {
+ // Important: these _must_ be sorted in increasing code point order
+ for (const auto& e : match_out_edges()) {
+ if (e.u32ch > ch) {
+ return &e;
+ }
+ }
+ return nullptr;
+ }
+
+ void add_match_out_edge(uint32_t out_char, uint32_t out_node) noexcept {
+ assert(num_match_out_edges < MaxCharOutEdges);
+ match_out_edges_buf[num_match_out_edges] = Edge(out_char, out_node);
+ ++num_match_out_edges;
+ }
+
+ void set_wildcard_out_edge(uint32_t out_node) noexcept {
+ assert(wildcard_edge_to == DOOMED);
+ wildcard_edge_to = out_node;
+ }
+};
+
+template <uint8_t MaxEdits>
+class ExplicitLevenshteinDfaImpl final : public LevenshteinDfa::Impl {
+public:
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+
+ using DfaNodeType = DfaNode<MaxEdits>;
+ using MatchResult = LevenshteinDfa::MatchResult;
+private:
+ std::vector<DfaNodeType> _nodes;
+public:
+ ExplicitLevenshteinDfaImpl() noexcept = default;
+ ~ExplicitLevenshteinDfaImpl() override = default;
+
+ static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+
+ void ensure_node_array_large_enough_for_index(uint32_t node_index) {
+ if (node_index >= _nodes.size()) {
+ _nodes.resize(node_index + 1);
+ }
+ }
+
+ void set_node_edit_distance(uint32_t node_index, uint8_t edits) {
+ _nodes[node_index].edits = edits;
+ }
+
+ void add_outgoing_edge(uint32_t from_node_idx, uint32_t to_node_idx, uint32_t out_char) {
+ _nodes[from_node_idx].add_match_out_edge(out_char, to_node_idx);
+ }
+
+ void set_wildcard_edge(uint32_t from_node_idx, uint32_t to_node_idx) {
+ _nodes[from_node_idx].set_wildcard_out_edge(to_node_idx);
+ }
+
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+
+ [[nodiscard]] size_t memory_usage() const noexcept override {
+ return sizeof(DfaNodeType) * _nodes.size();
+ }
+
+ void dump_as_graphviz(std::ostream& os) const override;
+};
+
+template <typename Traits>
+class ExplicitLevenshteinDfaBuilder {
+ std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+public:
+ explicit ExplicitLevenshteinDfaBuilder(std::string_view str)
+ : ExplicitLevenshteinDfaBuilder(utf8_string_to_utf32(str))
+ {}
+
+ explicit ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str) noexcept
+ : _u32_str_buf(std::move(str))
+ {}
+
+ [[nodiscard]] LevenshteinDfa build_dfa() const;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
new file mode 100644
index 00000000000..0960219aff3
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -0,0 +1,228 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "explicit_levenshtein_dfa.h"
+#include "match_algorithm.hpp"
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
+#include <iostream>
+#include <span>
+#include <queue>
+
+namespace vespalib::fuzzy {
+
+// DfaMatcher adapter for explicit DFA implementation
+template <uint8_t MaxEdits>
+struct ExplicitDfaMatcher {
+ using DfaNodeType = typename ExplicitLevenshteinDfaImpl<MaxEdits>::DfaNodeType;
+ using StateType = const DfaNodeType*;
+ using EdgeType = const DfaNodeType::Edge*;
+
+ using StateParamType = const DfaNodeType*;
+
+ const std::span<const DfaNodeType> _nodes;
+
+ explicit ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes) noexcept
+ : _nodes(nodes)
+ {}
+
+ static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+
+ StateType start() const noexcept {
+ return &_nodes[0];
+ }
+ bool has_higher_out_edge(StateType node, uint32_t mch) const noexcept {
+ return node->has_higher_out_edge(mch);
+ }
+ StateType match_input(StateType node, uint32_t mch) const noexcept {
+ auto maybe_node_idx = node->match_or_doomed(mch);
+ return ((maybe_node_idx != DOOMED) ? &_nodes[maybe_node_idx] : nullptr);
+ }
+ bool is_match(StateType node) const noexcept {
+ return node->edits <= max_edits();
+ }
+ bool can_match(StateType node) const noexcept {
+ return node != nullptr;
+ }
+ uint8_t match_edit_distance(StateType node) const noexcept {
+ return node->edits;
+ }
+ bool valid_state(StateType node) const noexcept {
+ return node != nullptr;
+ }
+ StateType match_wildcard(StateType node) const noexcept {
+ auto edge_to = node->wildcard_edge_to_or_doomed();
+ return ((edge_to != DOOMED) ? &_nodes[edge_to] : nullptr);
+ }
+ bool has_exact_explicit_out_edge(StateType node, uint32_t ch) const noexcept {
+ return node->has_exact_match(ch);
+ }
+ EdgeType lowest_higher_explicit_out_edge(StateType node, uint32_t ch) const noexcept {
+ return node->lowest_higher_explicit_out_edge(ch);
+ }
+ EdgeType smallest_explicit_out_edge(StateType node) const noexcept {
+ // Out-edges are pre-ordered in increasing code point order, so the first
+ // element is always the smallest possible matching character.
+ assert(!node->match_out_edges().empty());
+ return &node->match_out_edges().front();
+ }
+ bool valid_edge(EdgeType edge) const noexcept {
+ return edge != nullptr;
+ }
+ uint32_t edge_to_u32char(EdgeType edge) const noexcept {
+ return edge->u32ch;
+ }
+ StateType edge_to_state([[maybe_unused]] StateType node, EdgeType edge) const noexcept {
+ return &_nodes[edge->node];
+ }
+};
+
+template <uint8_t MaxEdits>
+LevenshteinDfa::MatchResult
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
+ ExplicitDfaMatcher<MaxEdits> matcher(_nodes);
+ return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t MaxEdits>
+void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) const {
+ os << std::dec << "digraph levenshtein_dfa {\n";
+ os << " fontname=\"Helvetica,Arial,sans-serif\"\n";
+ os << " node [shape=circle, fontname=\"Helvetica,Arial,sans-serif\", fixedsize=true];\n";
+ os << " edge [fontname=\"Helvetica,Arial,sans-serif\"];\n";
+ for (size_t i = 0; i < _nodes.size(); ++i) {
+ const auto& node = _nodes[i];
+ if (node.edits <= max_edits()) {
+ os << " " << i << " [label=\"" << i << "(" << static_cast<int>(node.edits) << ")\", style=\"filled\"];\n";
+ }
+ for (const auto& edge : node.match_out_edges()) {
+ std::string as_utf8;
+ append_utf32_char_as_utf8(as_utf8, edge.u32ch);
+ os << " " << i << " -> " << edge.node << " [label=\"" << as_utf8 << "\"];\n";
+ }
+ if (node.wildcard_edge_to != DOOMED) {
+ os << " " << i << " -> " << node.wildcard_edge_to << " [label=\"*\"];\n";
+ }
+ }
+ os << "}\n";
+}
+
+namespace {
+
+template <typename StateType>
+struct ExploreState {
+ using NodeIdAndExplored = std::pair<uint32_t, bool>;
+ using SparseExploredStates = vespalib::hash_map<StateType, NodeIdAndExplored, typename StateType::hash>;
+
+ uint32_t state_counter;
+ SparseExploredStates explored_states;
+
+ ExploreState();
+ ~ExploreState();
+
+ [[nodiscard]] SparseExploredStates::iterator node_of(const StateType& state) {
+ auto maybe_explored = explored_states.find(state);
+ if (maybe_explored != explored_states.end()) {
+ return maybe_explored;
+ }
+ uint32_t this_node = state_counter;
+ assert(state_counter < UINT32_MAX);
+ ++state_counter;
+ return explored_states.insert(std::make_pair(state, std::make_pair(this_node, false))).first; // not yet explored;
+ }
+
+ [[nodiscard]] bool already_explored(const SparseExploredStates::iterator& node) const noexcept {
+ return node->second.second;
+ }
+
+ void tag_as_explored(SparseExploredStates::iterator& node) noexcept {
+ node->second.second = true;
+ }
+};
+
+template <typename StateType>
+ExploreState<StateType>::ExploreState()
+ : state_counter(0),
+ explored_states()
+{}
+
+template <typename StateType>
+ExploreState<StateType>::~ExploreState() = default;
+
+template <typename Traits>
+class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> {
+ using Base = DfaSteppingBase<Traits>;
+
+ using StateType = typename Base::StateType;
+ using TransitionsType = typename Base::TransitionsType;
+
+ using Base::_u32_str;
+ using Base::max_edits;
+ using Base::start;
+ using Base::match_edit_distance;
+ using Base::step;
+ using Base::is_match;
+ using Base::can_match;
+ using Base::transitions;
+public:
+ explicit ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str) noexcept
+ : DfaSteppingBase<Traits>(str)
+ {
+ assert(str.size() < UINT32_MAX / max_out_edges_per_node());
+ }
+
+ [[nodiscard]] static constexpr uint8_t max_out_edges_per_node() noexcept {
+ // Max possible out transition characters (2k+1) + one wildcard edge.
+ return diag(max_edits()) + 1;
+ }
+
+ [[nodiscard]] LevenshteinDfa build_dfa() const;
+};
+
+template <typename Traits>
+LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
+ auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>();
+ ExploreState<StateType> exp;
+ // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_
+ // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth.
+ // This does _not_ always hold, such as if you have A->B and A->C->B (i.e. both parent and
+ // grandparent have a transition to the same state), in which case B may be allocated before C.
+ std::queue<StateType> to_explore;
+ to_explore.push(start());
+ while (!to_explore.empty()) {
+ auto state = std::move(to_explore.front());
+ to_explore.pop();
+ auto this_node = exp.node_of(state); // note: invalidated by subsequent calls to node_of
+ if (exp.already_explored(this_node)) {
+ continue;
+ }
+ exp.tag_as_explored(this_node);
+ const auto this_node_idx = this_node->second.first;
+ dfa->ensure_node_array_large_enough_for_index(this_node_idx);
+ dfa->set_node_edit_distance(this_node_idx, match_edit_distance(state));
+ auto t = transitions(state);
+ for (uint32_t out_c : t.u32_chars()) {
+ auto new_state = step(state, out_c);
+ auto out_node = exp.node_of(new_state);
+ dfa->add_outgoing_edge(this_node_idx, out_node->second.first, out_c);
+ to_explore.push(std::move(new_state));
+ }
+ auto wildcard_state = step(state, WILDCARD);
+ if (can_match(wildcard_state)) {
+ auto out_node = exp.node_of(wildcard_state);
+ dfa->set_wildcard_edge(this_node_idx, out_node->second.first);
+ to_explore.push(std::move(wildcard_state));
+ } // else: don't bother
+ }
+ return LevenshteinDfa(std::move(dfa));
+}
+
+} // anon ns
+
+template <typename Traits>
+LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const {
+ ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf);
+ return builder.build_dfa();
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
new file mode 100644
index 00000000000..8b9d2eddcac
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
@@ -0,0 +1,9 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "implicit_levenshtein_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>;
+template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>;
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
new file mode 100644
index 00000000000..0846b95d135
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -0,0 +1,35 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "levenshtein_dfa.h"
+#include "unicode_utils.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+template <typename Traits>
+class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
+ std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+public:
+ using MatchResult = LevenshteinDfa::MatchResult;
+
+ explicit ImplicitLevenshteinDfa(std::string_view str)
+ : ImplicitLevenshteinDfa(utf8_string_to_utf32(str))
+ {}
+
+ explicit ImplicitLevenshteinDfa(std::vector<uint32_t> str) noexcept
+ : _u32_str_buf(std::move(str))
+ {}
+
+ ~ImplicitLevenshteinDfa() override = default;
+
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+
+ [[nodiscard]] size_t memory_usage() const noexcept override {
+ return _u32_str_buf.size() * sizeof(uint32_t);
+ }
+
+ void dump_as_graphviz(std::ostream& os) const override;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
new file mode 100644
index 00000000000..4ee468e424b
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -0,0 +1,121 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "dfa_stepping_base.h"
+#include "implicit_levenshtein_dfa.h"
+#include "match_algorithm.hpp"
+#include "sparse_state.h"
+#include <cassert>
+#include <stdexcept>
+
+namespace vespalib::fuzzy {
+
+// DfaMatcher adapter for implicit DFA implementation
+template <typename Traits>
+struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
+ using Base = DfaSteppingBase<Traits>;
+
+ using StateType = typename Base::StateType;
+ using EdgeType = uint32_t; // Just the raw u32 character value
+
+ using StateParamType = const StateType&;
+
+ using Base::_u32_str;
+ using Base::max_edits;
+ using Base::start;
+ using Base::match_edit_distance;
+ using Base::step;
+ using Base::can_wildcard_step;
+ using Base::is_match;
+ using Base::can_match;
+
+ explicit ImplicitDfaMatcher(std::span<const uint32_t> u32_str) noexcept
+ : Base(u32_str)
+ {}
+
+ // start, is_match, can_match, match_edit_distance are all provided by base type
+
+ template <typename F>
+ bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
+ for (uint32_t i = 0; i < state.size(); ++i) {
+ const auto idx = state.index(i);
+ if ((idx < _u32_str.size()) && f(_u32_str[idx])) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ template <typename F>
+ void for_each_char(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
+ for (uint32_t i = 0; i < state.size(); ++i) {
+ const auto idx = state.index(i);
+ if ((idx < _u32_str.size())) [[likely]] {
+ f(_u32_str[idx]);
+ }
+ }
+ }
+
+ bool has_explicit_higher_out_edge(const StateType& state, uint32_t ch) const noexcept {
+ return has_any_char_matching(state, [ch](uint32_t state_ch) noexcept {
+ return state_ch > ch;
+ });
+ }
+
+ bool has_higher_out_edge(const StateType& state, uint32_t mch) const noexcept {
+ return (has_explicit_higher_out_edge(state, mch) || can_wildcard_step(state));
+ }
+ StateType match_input(const StateType& state, uint32_t mch) const noexcept {
+ return step(state, mch);
+ }
+ bool valid_state(const StateType& state) const noexcept {
+ return !state.empty();
+ }
+ StateType match_wildcard(const StateType& state) const noexcept {
+ return step(state, WILDCARD);
+ }
+ bool has_exact_explicit_out_edge(const StateType& state, uint32_t ch) const noexcept {
+ return has_any_char_matching(state, [ch](uint32_t state_ch) noexcept {
+ return state_ch == ch;
+ });
+ }
+ EdgeType lowest_higher_explicit_out_edge(const StateType& state, uint32_t ch) const noexcept {
+ uint32_t min_ch = UINT32_MAX;
+ for_each_char(state, [ch, &min_ch](uint32_t state_ch) noexcept {
+ if ((state_ch > ch) && (state_ch < min_ch)) {
+ min_ch = state_ch;
+ }
+ });
+ return min_ch;
+ }
+ EdgeType smallest_explicit_out_edge(const StateType& state) const noexcept {
+ uint32_t min_ch = UINT32_MAX;
+ for_each_char(state, [&min_ch](uint32_t state_ch) noexcept {
+ min_ch = std::min(min_ch, state_ch);
+ });
+ return min_ch;
+ }
+ bool valid_edge(EdgeType edge) const noexcept {
+ return edge != UINT32_MAX;
+ }
+ uint32_t edge_to_u32char(EdgeType edge) const noexcept {
+ return edge;
+ }
+ StateType edge_to_state(const StateType& state, EdgeType edge) const noexcept {
+ return step(state, edge);
+ }
+};
+
+template <typename Traits>
+LevenshteinDfa::MatchResult
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
+ ImplicitDfaMatcher<Traits> matcher(_u32_str_buf);
+ return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
+}
+
+template <typename Traits>
+void ImplicitLevenshteinDfa<Traits>::dump_as_graphviz(std::ostream&) const {
+ throw std::runtime_error("Graphviz output not available for implicit Levenshtein DFA");
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
new file mode 100644
index 00000000000..e75ef8365bf
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -0,0 +1,83 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "explicit_levenshtein_dfa.h"
+#include "implicit_levenshtein_dfa.h"
+#include "levenshtein_dfa.h"
+#include <vespa/vespalib/util/stringfmt.h>
+#include <memory>
+
+namespace vespalib::fuzzy {
+
+LevenshteinDfa::LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept
+ : _impl(std::move(impl))
+{}
+
+LevenshteinDfa::LevenshteinDfa(LevenshteinDfa&&) noexcept = default;
+LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default;
+
+LevenshteinDfa::~LevenshteinDfa() = default;
+
+LevenshteinDfa::MatchResult
+LevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const {
+ return _impl->match(u8str, successor_out);
+}
+
+size_t LevenshteinDfa::memory_usage() const noexcept {
+ return _impl->memory_usage();
+}
+
+void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
+ _impl->dump_as_graphviz(out);
+}
+
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, DfaType dfa_type) {
+ if (max_edits != 1 && max_edits != 2) {
+ throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
+ }
+ if (dfa_type == DfaType::Implicit) {
+ if (max_edits == 1) {
+ return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(target_string));
+ } else { // max_edits == 2
+ return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(target_string));
+ }
+ } else { // DfaType::Explicit
+ if (max_edits == 1) {
+ return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(target_string).build_dfa();
+ } else { // max_edits == 2
+ return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(target_string).build_dfa();
+ }
+ }
+
+}
+
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits) {
+ // TODO automatically select implementation based on target length/max edits?
+ // Suggestion:
+ // - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64).
+ // - Implicit DFA otherwise.
+ // This keeps memory overhead < 64k and DFA construction time < 300 usec (measured on
+ // an M1 Pro; your mileage may vary etc).
+ // Ideally the implicit DFA would always be the fastest (or at least approximately as
+ // fast as the explicit DFA), but this is not yet the case.
+ return build(target_string, max_edits, DfaType::Implicit);
+}
+
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) {
+ if (mos.matches()) {
+ os << "match(" << static_cast<int>(mos.edits()) << " edits)";
+ } else {
+ os << "mismatch";
+ }
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) {
+ if (dt == LevenshteinDfa::DfaType::Implicit) {
+ os << "Implicit";
+ } else {
+ assert(dt == LevenshteinDfa::DfaType::Explicit);
+ os << "Explicit";
+ }
+ return os;
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
new file mode 100644
index 00000000000..a26ccbe87ee
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -0,0 +1,244 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace vespalib::fuzzy {
+
+/**
+ * Levenshtein Deterministic Finite Automata (DFA)
+ *
+ * The Levenshtein distance (or edit distance) is the minimum number of edits (additions,
+ * deletions or substitutions) needed to transform a particular source string s to a
+ * particular target string t.
+ *
+ * Let m be the length of the source string and n be the length of the target string.
+ *
+ * The classic dynamic programming algorithm uses a n x m cost matrix and is therefore
+ * O(nm) in space and time. By observing that only 2 rows of the matrix are actually
+ * needed, this is commonly reduced to O(n) space complexity (still O(nm) time complexity).
+ * When the maximum number of allowed edits is constrained to k, some clever observations
+ * about the nature of the cost matrix allows for reducing the time complexity down to
+ * O(kn) (more specifically, O((2k+1) * n)). When k is fixed (e.g. k in {1, 2}), the
+ * time complexity simplifies down to O(n).
+ *
+ * This implements code for building and evaluating Levenshtein Deterministic Finite
+ * Automata, where the resulting DFA efficiently matches all possible source strings that
+ * can be transformed to the target string within k max edits. This allows for easy linear
+ * matching of strings.
+ *
+ * Inspiration:
+ * - http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
+ * - https://julesjacobs.com/2015/06/17/disqus-levenshtein-simple-and-fast.html
+ *
+ * The latter in particular was a close inspiration for the sparse DFA state management.
+ *
+ * ====== Dictionary skipping via successor string generation ======
+ *
+ * Scanning for edit distance matches frequently takes place against a sorted dictionary.
+ * When matching using a DFA, in the case where the source string does _not_ match, we can
+ * generate the _successor_ string; the next matching string that is lexicographically
+ * _greater_ than the source string. This string has the invariant that there are no
+ * possibly matching strings within k edits ordered after the source string but before
+ * the successor.
+ *
+ * This lets us do possibly massive leaps forward in the dictionary, turning a dictionary
+ * scan into a sublinear operation.
+ *
+ * Note that the implemented successor algorithm is slightly different from that described
+ * in the above blog post. The implemented algorithm requires zero extra data structures
+ * than the DFA itself and the target string and tries to be extra clever with reducing
+ * the number of code point conversions required
+ *
+ * ====== Unicode support ======
+ *
+ * Matching and successor generation is fully Unicode-aware. All input strings are expected
+ * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats;
+ * see the documentation for match()).
+ *
+ * Internally, matching is done on UTF-32 code points and the DFA itself is built around
+ * UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA.
+ *
+ * ====== Memory usage ======
+ *
+ * There is always a baseline DFA memory usage O(n) in the target string, as the
+ * underlying DFA needs to convert the input UTF-8 string to explicit UTF-32 chars.
+ *
+ * Aside from the baseline, memory usage depends on whether an explicit or implicit DFA
+ * is used.
+ *
+ * ------ Explicit DFA ------
+ *
+ * The explicit DFA graph takes up quite a bit more memory than the original string
+ * representation (one reason is the use of UTF-32 characters under the hood).
+ *
+ * Expected upper bound memory usage for a string of length n with max edits k is
+ *
+ * (2k+1) * N(k) * n * W(k)
+ *
+ * where N(1) is expected to be 32 and N(2) is 48, W(1) is 1.34 and W(2) is 3.2 (empirically
+ * derived).
+ *
+ * Memory usage during building is higher due to keeping track of the set of generated
+ * states in a hash table, but still linear in input size. This extra memory is freed
+ * once building is complete.
+ *
+ * ------ Implicit DFA ------
+ *
+ * Implicit DFAs have a O(1) memory usage during evaluation, which all lives on the stack
+ * or in registers (this does not include the successor string, which is provided by the
+ * caller).
+ *
+ * Since the sparse state stepping is currently not as fast as explicit DFA node traversal,
+ * string matching is slower than with the explicit DFA.
+ *
+ * ====== In short ======
+ *
+ * - Immutable; build once, run many times.
+ * - Explicit DFA build time is amortized linear in target string size.
+ * - Implicit DFA build time is O(1) (aside from initial UTF-32 conversion)
+ * - Zero-allocation matching.
+ * - Matching takes in raw UTF-8 input, no need to pre-convert.
+ * - Streaming UTF-8 to UTF-32 conversion; fully unicode-aware (DFA uses UTF-32 code
+ * points internally).
+ * - If required, it's possible (but not currently implemented) to bake case
+ * insensitive matching semantics into the generated DFA itself.
+ * - Allows for dictionary forward-skipping via successor algorithm.
+ * - Amortized zero allocations for successor string building when reusing string
+ * between matches.
+ * - Successor string is generated in-place as UTF-8 and can be directly used as input
+ * to a byte-wise dictionary seek.
+ */
+class LevenshteinDfa {
+public:
+ class MatchResult {
+ uint8_t _max_edits;
+ uint8_t _edits;
+ public:
+ constexpr MatchResult(uint8_t max_edits, uint8_t edits) noexcept
+ : _max_edits(max_edits),
+ _edits(edits)
+ {}
+
+ static constexpr MatchResult make_match(uint8_t max_edits, uint8_t edits) noexcept {
+ return {max_edits, edits};
+ }
+
+ static constexpr MatchResult make_mismatch(uint8_t max_edits) noexcept {
+ return {max_edits, static_cast<uint8_t>(max_edits + 1)};
+ }
+
+ [[nodiscard]] constexpr bool matches() const noexcept { return _edits <= _max_edits; }
+ [[nodiscard]] constexpr uint8_t edits() const noexcept { return _edits; }
+ [[nodiscard]] constexpr uint8_t max_edits() const noexcept { return _max_edits; }
+ };
+
+ struct Impl {
+ virtual ~Impl() = default;
+ [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0;
+ [[nodiscard]] virtual size_t memory_usage() const noexcept = 0;
+ virtual void dump_as_graphviz(std::ostream& out) const = 0;
+ };
+
+private:
+ std::unique_ptr<Impl> _impl;
+public:
+ explicit LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept;
+ LevenshteinDfa(LevenshteinDfa&&) noexcept;
+ LevenshteinDfa& operator=(LevenshteinDfa&&) noexcept;
+ LevenshteinDfa(const LevenshteinDfa&) = delete;
+ LevenshteinDfa& operator=(const LevenshteinDfa&) = delete;
+ ~LevenshteinDfa();
+
+ /**
+ * Attempts to match the source string `source` with the target string this DFA was
+ * built with, emitting a successor string on mismatch if `successor_out` != nullptr.
+ *
+ * `source` must not contain any null UTF-8 chars.
+ *
+ * Match case:
+ * Iff `source` is _within_ the maximum edit distance, returns a MatchResult with
+ * matches() == true and edits() == the actual edit distance. If `successor_out`
+ * is not nullptr, the string pointed to is _not_ modified.
+ *
+ * Mismatch case:
+ * Iff `source` is _beyond_ the maximum edit distance, returns a MatchResult with
+ * matches() == false.
+ *
+ * Iff `successor_out` is not nullptr, the following holds:
+ * - `successor_out` is modified to contain the next (in byte-wise ordering) possible
+ * _matching_ string S so that there exists no other matching string S' that is
+ * greater than `source` but smaller than S.
+ * - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally
+ * encode in bitwise form, but the _code points_ they encode may not be valid.
+ * In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of
+ * which are valid UTF-8.
+ *
+ * It is expected that the consumer of `successor_out` is only interested in the
+ * memcmp()-ordering of strings and not whether they are technically valid Unicode.
+ * This should be the case for low-level dictionary data structures etc.
+ *
+ * Memory allocation:
+ * This function does not directly or indirectly allocate any heap memory if either:
+ *
+ * - the input string is within the max edit distance, or
+ * - `successor_out` is nullptr, or
+ * - `successor_out` has sufficient capacity to hold the generated successor
+ *
+ * By reusing the successor string across many calls, this therefore amortizes memory
+ * allocations down to near zero per invocation.
+ */
+ [[nodiscard]] MatchResult match(std::string_view source, std::string* successor_out) const;
+
+ /**
+ * Returns how much memory is used by the underlying DFA representation, in bytes.
+ */
+ [[nodiscard]] size_t memory_usage() const noexcept;
+
+ enum class DfaType {
+ Implicit,
+ Explicit
+ };
+
+ /**
+ * Builds and returns a Levenshtein DFA that matches all strings within `max_edits`
+ * edits of `target_string`. The type of DFA returned is specified by dfa_type.
+ *
+ * `max_edits` must be in {1, 2}. Throws std::invalid_argument if outside range.
+ *
+ * `target_string` must not contain any null UTF-8 chars.
+ */
+ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string,
+ uint8_t max_edits,
+ DfaType dfa_type);
+
+ /**
+ * Same as build() but currently always returns an implicit DFA.
+ */
+ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits);
+
+ /**
+ * Dumps the DFA as a Graphviz graph in text format to the provided output stream.
+ *
+ * Note: Only supported for _explicit_ DFAs. Trying to call this function on an implicit
+ * DFA will throw a std::runtime_error, as there is no concrete underlying graph
+ * structure to dump.
+ *
+ * Note that only _matching_ state transitions are present in the DFA, and therefore only
+ * such transitions are present in the generated graph. Overall this makes the graph for
+ * longer strings much more manageable, as the number of out-edges from a particular depth
+ * in the graph depends on the max number of edits and not on the length of the string
+ * itself. Otherwise, you'd have a whole bunch of nodes with out-edges to the same terminal
+ * non-matching state node.
+ */
+ void dump_as_graphviz(std::ostream& out) const;
+};
+
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos);
+std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt);
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
new file mode 100644
index 00000000000..206b69f8ebe
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -0,0 +1,291 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "dfa_matcher.h"
+#include "levenshtein_dfa.h"
+#include "unicode_utils.h"
+#include <vespa/vespalib/text/utf8.h>
+#include <cassert>
+#include <concepts>
+
+namespace vespalib::fuzzy {
+
+/**
+ * Implementation of algorithm for linear-time k-max edits string matching and successor
+ * string generation over an abstract DFA representation.
+ *
+ * The implementation is agnostic to how the underlying DFA is implemented, but requires
+ * an appropriate adapter that satisfies the DfaMatcher concept contracts.
+ */
+template <uint8_t MaxEdits>
+struct MatchAlgorithm {
+ using MatchResult = LevenshteinDfa::MatchResult;
+
+ static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+
+ /**
+ * Matches UTF-8 source string `source` against the target DFA, optionally generating
+ * the successor string iff the source string is not within the maximum number of edits
+ * of the target string.
+ *
+ * The actual match loop is very simple: we try to match the DFA as far as we can
+ * before either consuming all input (source string) characters or ending up in a non-
+ * matching state before we have consumed all input. In the former case, we may be in
+ * a matching state (consider matching "foo" with the target string "food"; after
+ * consuming all input we'll be in a matching state with 1 edit). In the latter case,
+ * the input string cannot possible match.
+ *
+ * If we end up in a matching state, all is well. We simply return a MatchResult with
+ * the number of edits the state represents.
+ *
+ * The interesting bit happens the string does _not_ match and we are asked to provide a
+ * _successor_ string that _does_ match and is strictly greater in lexicographic order.
+ *
+ * We lean on some core invariants:
+ *
+ * - The m x n (|source| x |target|) Levenshtein matrix provides, for any m[i, j] with
+ * i in [1, m], j in [1, n], the _minimum possible_ number of edits that can transform
+ * the source string prefix of length `i` to the target string prefix of length `j`.
+ * This means there is no way of transforming said source prefix using _fewer_ edits.
+ *
+ * - Any given DFA state corresponds to a unique row in the Levenshtein matrix, thus
+ * transitively inheriting the invariants of the matrix row elements themselves, such
+ * as representing the minimum number of edits.
+ *
+ * We have two mismatch cases:
+ *
+ * 1. We've matched the entire source string without ending in an accepting state.
+ *
+ * This can only happen if the input is a (possibly edited) prefix of the target string.
+ * Any and all _longer_ strings with this prefix is inherently lexicographically greater,
+ * so we emit the smallest possible suffix that turns prefix || suffix into a matching
+ * string.
+ *
+ * See emit_smallest_matching_suffix() for details.
+ *
+ * 2. We've matched a prefix of the source string without ending in an accepting state.
+ *
+ * This case is trickier than when the entire source string is a prefix, as we cannot
+ * just emit a suffix to the source to create a matching, lexicographically greater string.
+ *
+ * Consider source "foxx" and target "food". There exists no suffix S in "food" that can
+ * turn "foxx" || S into a matching string within k=1 edits.
+ *
+ * So we have to backtrack to somewhere.
+ *
+ * That "somewhere" is the state that maximizes the size of the source prefix while
+ * allowing us to emit a greater suffix.
+ *
+ * For each state we visit, we check if there exists at least one higher out edge than the
+ * one taken out from that state (this is possibly a wildcard edge). If one exists, we
+ * copy the state to `last_state_with_higher_out` and remember the state's source string
+ * prefix as well as the source string character that transitions us away from the state
+ * (this will be our candidate for building a greater suffix).
+ *
+ * When we fail to match the entire source string, we know that last_state_with_higher_out
+ * represents the last possible branching point (and therefore the longest prefix) where
+ * we can substitute in or insert a higher character, in turn creating a greater suffix.
+ *
+ * Proof by contradiction: let `last_state_with_higher_out` be S and assume there exists
+ * a state S' that has a greater source string prefix than S while still allowing for
+ * emitting a lexicographically greater suffix that is within max edits k. We terminate
+ * the match loop once can_match(X) is false for any state X, where X subsumes S by
+ * definition. For S' to exist, it must be possible for a transition to exist from X to
+ * a later state that can have a higher out edge. However, edit distance costs can
+ * never decrease, only stay constant (with matching substitutions) or increase (with
+ * insertions, deletions or non-matching substitutions), so it's impossible to follow
+ * an out-edge from X to any later potentially matching state. Thus, S' can not exist
+ * and we have a contradiction.
+ *
+ * Since we want to generate the smallest possible larger string that matches, we ideally
+ * want to emit a character that is +1 of the source character after the shared prefix.
+ * This is using the "higher out"-character we remembered earlier. We do this if we have
+ * a wildcard out edge (or if there exists an explicit out-edge for value char+1).
+ * Otherwise, we have to follow the highest explicitly present out-edge.
+ *
+ * Once we have emitted one single character that gets us lexicographically higher than
+ * the source string, we then emit the smallest possible suffix to this. This uses the
+ * same minimal suffix generation logic as mismatch case 1).
+ *
+ * See `backtrack_and_emit_greater_suffix()` for details.
+ *
+ * Example:
+ * (This is easiest to follow by looking at examples/food_dfa.svg)
+ *
+ * Source "foxx", target "food" and k=1:
+ *
+ * After matching "fo" with 0 edits we reach a state with out-edges {d, o, *}. This state
+ * has an implicitly higher out-edge (*) and we remember it and the char 'x' for later.
+ * Edge 'x' can only happen via *, so we take that path.
+ *
+ * After matching "fox" with 1 edit we reach a state with out-edges {d, o}. There is
+ * no out-edge for 'x' and the state is not a matching state, so we need to backtrack
+ * and generate a successor.
+ *
+ * We backtrack to the state representing "fo" and emit it as a successor prefix. We
+ * observe that this state has a wildcard out-edge and emit 'x'+1 == 'y' to the successor
+ * string and continue with emitting the smallest suffix. We now have a successor
+ * prefix of "foy", with which we reach the same logical state as we did with "fox"
+ * previously. The smallest out-edge here is 'd', so we take it. This leaves us in an
+ * accepting (matching) state, so suffix generation completes.
+ *
+ * "foxx" -> "foyd"
+ *
+ * Note that it's possible for the prefix to be empty, which results in a successor
+ * that has nothing in common with the source altogether.
+ * Example: "gp" -> "hfood" (+1 char value case)
+ *
+ * Performance note:
+ * Both the input and successor output strings are in UTF-8 format. To avoid doing
+ * duplicate work, we keep track of the byte length of the string prefix that will be
+ * part of the successor and simply copy it verbatim instead of building the string
+ * from converted UTF-32 -> UTF-8 chars as we go.
+ *
+ * TODO we could probably also optimize the smallest suffix generation with this when
+ * we know we can no longer insert any smaller char substitutions and the only way
+ * to complete the string is to emit it verbatim.
+ * - To do this we'd need both the original UTF-8 target string as well as a
+ * secondary vector that maps u32 character index to the corresponding UTF-8 index.
+ * Both trivial to get as part of DFA initialization.
+ */
+ template <DfaMatcher Matcher>
+ static MatchResult match(const Matcher& matcher,
+ std::string_view source,
+ std::string* successor_out)
+ {
+ using StateType = typename Matcher::StateType;
+ vespalib::Utf8Reader u8_reader(source.data(), source.size());
+ uint32_t n_prefix_u8_bytes = 0;
+ uint32_t char_after_prefix = 0;
+ StateType last_state_with_higher_out = StateType{};
+
+ StateType state = matcher.start();
+ while (u8_reader.hasMore()) {
+ const auto u8_pos_before_char = u8_reader.getPos();
+ const uint32_t mch = u8_reader.getChar();
+ if (successor_out && matcher.has_higher_out_edge(state, mch)) {
+ last_state_with_higher_out = state;
+ n_prefix_u8_bytes = u8_pos_before_char;
+ char_after_prefix = mch;
+ }
+ auto maybe_next = matcher.match_input(state, mch);
+ if (matcher.can_match(maybe_next)) {
+ state = maybe_next;
+ } else {
+ // Can never match; find the successor if requested
+ if (successor_out) {
+ *successor_out = source.substr(0, n_prefix_u8_bytes);
+ assert(matcher.valid_state(last_state_with_higher_out));
+ backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
+ char_after_prefix, *successor_out);
+ }
+ return MatchResult::make_mismatch(max_edits());
+ }
+ }
+ const auto edits = matcher.match_edit_distance(state);
+ if (edits <= max_edits()) {
+ return MatchResult::make_match(max_edits(), edits);
+ }
+ if (successor_out) {
+ *successor_out = source;
+ emit_smallest_matching_suffix(matcher, state, *successor_out);
+ }
+ return MatchResult::make_mismatch(max_edits());
+ }
+
+ /**
+ * Instantly backtrack to the last possible branching point in the DFA where we can
+ * choose some higher outgoing edge character value and still match the DFA. If the node
+ * has a wildcard edge, we can bump the input char by one and generate the smallest
+ * possible matching suffix to that. Otherwise, choose the smallest out edge that is
+ * greater than the input character at that location and _then_ emit the smallest
+ * matching prefix.
+ *
+ * precondition: `last_node_with_higher_out` has either a wildcard edge or a char match
+ * edge that compares greater than `input_at_branch`.
+ */
+ template <DfaMatcher Matcher>
+ static void backtrack_and_emit_greater_suffix(
+ const Matcher& matcher,
+ typename Matcher::StateParamType last_state_with_higher_out,
+ const uint32_t input_at_branch,
+ std::string& successor)
+ {
+ auto wildcard_state = matcher.match_wildcard(last_state_with_higher_out);
+ if (matcher.can_match(wildcard_state)) {
+ // `input_at_branch` may be U+10FFFF, with +1 being outside legal Unicode _code point_
+ // range but _within_ what UTF-8 can technically _encode_.
+ // We assume that successor-consumers do not care about anything except byte-wise
+ // ordering. This is similar to what RE2's PossibleMatchRange emits to represent a
+ // UTF-8 upper bound, so not without precedent.
+ // If the resulting character corresponds to an existing out-edge we _must_ take it
+ // instead of the wildcard edge, or we'll end up in the wrong state.
+ const auto next_char = input_at_branch + 1;
+ if (!matcher.has_exact_explicit_out_edge(last_state_with_higher_out, next_char)) {
+ append_utf32_char_as_utf8(successor, next_char);
+ emit_smallest_matching_suffix(matcher, wildcard_state, successor);
+ return;
+ } // else: handle exact match below (it will be found as the first higher out edge)
+ }
+ const auto first_highest_edge = matcher.lowest_higher_explicit_out_edge(last_state_with_higher_out, input_at_branch);
+ assert(matcher.valid_edge(first_highest_edge));
+ append_utf32_char_as_utf8(successor, matcher.edge_to_u32char(first_highest_edge));
+ emit_smallest_matching_suffix(matcher, matcher.edge_to_state(last_state_with_higher_out, first_highest_edge), successor);
+ }
+
+ /**
+ * The smallest possible suffix is generated by following the smallest out-edge per state,
+ * until we reach a state that is a match. It is possible that the smallest out edge is a
+ * "wildcard" edge (our terminology), which means that we can insert/substitute an arbitrary
+ * character and still have `can_match(resulting state)` be true. In this case we emit the
+ * smallest possible non-null UTF-8 character (0x01).
+ *
+ * Examples:
+ * (These are easiest to follow by looking at examples/food_dfa.svg)
+ *
+ * Source "fo", target "food" and k=1:
+ *
+ * After matching "fo" we have 1 edit to spare. The smallest valid, non-empty UTF-8 suffix
+ * to this string must necessarily begin with 0x01, so that's what we emit. The smallest
+ * edge we can follow from the resulting state is 'd', and that is a accepting (matching)
+ * state.
+ *
+ * "fo" -> "fo\x01d"
+ *
+ * Source "fx", target "food" and k=1:
+ *
+ * After matching "fx" we have no edits to spare. The smallest character reachable from
+ * the state is 'o' (in fact, it is the only out edge available since we're down to zero
+ * available edits). The next state has an out-edge to 'd' and 'o', and we choose 'd'
+ * since it is smallest. This leaves us in an accepting (matching) state and we terminate
+ * the loop.
+ *
+ * "fx" -> "fxod"
+ */
+ // TODO consider variant for only emitting _prefix of suffix_ to avoid having to generate
+ // the full string? Won't generate a matching string, but will be lexicographically greater.
+ template <DfaMatcher Matcher>
+ static void emit_smallest_matching_suffix(
+ const Matcher& matcher,
+ typename Matcher::StateParamType from,
+ std::string& str)
+ {
+ auto state = from;
+ while (!matcher.is_match(state)) {
+ // If we can take a wildcard path, emit the smallest possible valid UTF-8 character (0x01).
+ // Otherwise, find the smallest char that can eventually lead us to a match.
+ auto wildcard_state = matcher.match_wildcard(state);
+ if (matcher.can_match(wildcard_state)) {
+ str += '\x01';
+ state = wildcard_state;
+ } else {
+ const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state);
+ assert(matcher.valid_edge(smallest_out_edge));
+ append_utf32_char_as_utf8(str, matcher.edge_to_u32char(smallest_out_edge));
+ state = matcher.edge_to_state(state, smallest_out_edge);
+ }
+ }
+ }
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
new file mode 100644
index 00000000000..40cfa5e6409
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
@@ -0,0 +1,175 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <ostream>
+#include <span>
+#include <xxh3.h> // TODO factor out?
+
+namespace vespalib::fuzzy {
+
+// Sentinel U32 char for state stepping that cannot match any target string characters
+constexpr const uint32_t WILDCARD = UINT32_MAX;
+
+/**
+ * diag(n) is the width of the diagonal of the cost matrix that can possibly be
+ * within k edits. This means that for a fixed k, it suffices to maintain state
+ * for up to and including diag(k) consecutive cells for any given matrix row.
+ */
+constexpr inline uint8_t diag(uint8_t k) noexcept {
+ return k*2 + 1;
+}
+
+template <uint8_t MaxEdits>
+struct FixedSparseState {
+private:
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+
+ std::array<uint32_t, diag(MaxEdits)> indices;
+ std::array<uint8_t, diag(MaxEdits)> costs; // elems are 1-1 with indices vector
+ uint8_t sz;
+public:
+ constexpr FixedSparseState() noexcept : indices(), costs(), sz(0) {}
+
+ [[nodiscard]] constexpr bool empty() const noexcept {
+ return (sz == 0);
+ }
+
+ [[nodiscard]] constexpr uint32_t size() const noexcept {
+ return sz;
+ }
+
+ [[nodiscard]] constexpr uint32_t index(uint32_t entry_idx) const noexcept {
+ return indices[entry_idx];
+ }
+
+ [[nodiscard]] constexpr uint8_t cost(uint32_t entry_idx) const noexcept {
+ return costs[entry_idx];
+ }
+
+ // Precondition: !empty()
+ [[nodiscard]] constexpr uint32_t last_index() const noexcept {
+ return indices[sz - 1];
+ }
+
+ // Precondition: !empty()
+ [[nodiscard]] constexpr uint8_t last_cost() const noexcept {
+ return costs[sz - 1];
+ }
+
+ void append(uint32_t index, uint8_t cost) noexcept {
+ assert(sz < diag(MaxEdits));
+ indices[sz] = index;
+ costs[sz] = cost;
+ ++sz;
+ }
+
+ constexpr bool operator==(const FixedSparseState& rhs) const noexcept {
+ if (sz != rhs.sz) {
+ return false;
+ }
+ return (std::equal(indices.begin(), indices.begin() + sz, rhs.indices.begin()) &&
+ std::equal(costs.begin(), costs.begin() + sz, rhs.costs.begin()));
+ }
+
+ struct hash {
+ size_t operator()(const FixedSparseState& s) const noexcept {
+ static_assert(std::is_same_v<uint32_t, std::decay_t<decltype(s.indices[0])>>);
+ static_assert(std::is_same_v<uint8_t, std::decay_t<decltype(s.costs[0])>>);
+ // FIXME GCC 12.2 worse-than-useless(tm) warning false positives :I
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+ return (XXH3_64bits(s.indices.data(), s.sz * sizeof(uint32_t)) ^
+ XXH3_64bits(s.costs.data(), s.sz));
+#pragma GCC diagnostic pop
+ }
+ };
+};
+
+/**
+ * Prints sparse states as a single matrix row. Columns prior to any state index
+ * are printed explicitly as '-' characters to make states line up when printed.
+ *
+ * Example output for the state (2:1, 3:1):
+ *
+ * [-, -, 1, 1]
+ *
+ * Only meant as a debugging aid during development, as states with high indices
+ * will emit very large strings.
+ */
+template <uint8_t MaxEdits> [[maybe_unused]]
+std::ostream& operator<<(std::ostream& os, const FixedSparseState<MaxEdits>& s) {
+ os << "[";
+ size_t last_idx = 0;
+ for (size_t i = 0; i < s.size(); ++i) {
+ if (i != 0) {
+ os << ", ";
+ }
+ for (size_t j = last_idx; j < s.indices[i]; ++j) {
+ os << "-, ";
+ }
+ last_idx = s.indices[i] + 1;
+ os << static_cast<uint32_t>(s.costs[i]);
+ }
+ os << "]";
+ return os;
+}
+
+template <uint8_t MaxEdits>
+struct FixedMaxEditsTransitions {
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+
+ std::array<uint32_t, diag(MaxEdits)> out_u32_chars;
+ uint8_t size;
+
+ constexpr FixedMaxEditsTransitions() noexcept : out_u32_chars(), size(0) {}
+
+ [[nodiscard]] constexpr bool has_char(uint32_t u32ch) const noexcept {
+ for (uint8_t i = 0; i < size; ++i) {
+ if (out_u32_chars[i] == u32ch) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void add_char(uint32_t u32ch) noexcept {
+ if (!has_char(u32ch)) {
+ assert(size < diag(MaxEdits));
+ out_u32_chars[size] = u32ch;
+ ++size;
+ }
+ }
+
+ constexpr std::span<const uint32_t> u32_chars() const noexcept {
+ return {out_u32_chars.begin(), out_u32_chars.begin() + size};
+ }
+
+ constexpr std::span<uint32_t> u32_chars() noexcept {
+ return {out_u32_chars.begin(), out_u32_chars.begin() + size};
+ }
+
+ void sort() noexcept {
+ // TODO use custom sorting networks for fixed array sizes <= 5?
+ // FIXME GCC 12.2 worse-than-useless(tm) warning false positives :I
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+ std::sort(out_u32_chars.begin(), out_u32_chars.begin() + size);
+#pragma GCC diagnostic pop
+ }
+};
+
+template <uint8_t MaxEdits>
+struct FixedMaxEditDistanceTraits {
+ static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2);
+ using StateType = FixedSparseState<MaxEdits>;
+ using TransitionsType = FixedMaxEditsTransitions<MaxEdits>;
+ constexpr static uint8_t max_edits() noexcept {
+ return MaxEdits;
+ }
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
new file mode 100644
index 00000000000..648be234562
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -0,0 +1,108 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "unicode_utils.h"
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <stdexcept>
+
+namespace vespalib::fuzzy {
+
+std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) {
+ vespalib::stringref ch_str(str.data(), str.size());
+ vespalib::Utf8Reader utf8_reader(ch_str);
+ std::vector<uint32_t> u32ret;
+ u32ret.reserve(str.size()); // Will over-allocate for all non-ASCII
+ while (utf8_reader.hasMore()) {
+ u32ret.emplace_back(utf8_reader.getChar());
+ }
+ return u32ret;
+}
+
+std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) {
+ return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size()));
+}
+
+[[noreturn]] void throw_bad_code_point(uint32_t codepoint) __attribute__((noinline));
+[[noreturn]] void throw_bad_code_point(uint32_t codepoint) {
+ throw std::invalid_argument(make_string("invalid UTF-32 codepoint: U+%04X (%u)", codepoint, codepoint));
+}
+
+namespace {
+
+/**
+ * Encodes a single UTF-32 `codepoint` to a 1-4 byte UTF-8 sequence.
+ * `
+ * `u8buf` must point to a buffer with at least 4 writable bytes.
+ *
+ * Returns the number of bytes written.
+ *
+ * See comments on append_utf32_char_as_utf8() as to why this is not a generic UTF-8
+ * encoding function that can be used in all possible scenarios.
+ */
+[[nodiscard]] uint8_t encode_utf8_char(uint32_t codepoint, unsigned char* u8buf) {
+ constexpr const uint8_t low_6bits_mask = 0x3F;
+
+ // Yanked and modified from utf8.cpp:
+ if (codepoint < 0x80) {
+ u8buf[0] = (char) codepoint;
+ return 1;
+ } else if (codepoint < 0x800) {
+ char low6 = (codepoint & low_6bits_mask);
+ low6 |= 0x80;
+ codepoint >>= 6;
+ char first5 = codepoint;
+ first5 |= 0xC0;
+ u8buf[0] = first5;
+ u8buf[1] = low6;
+ return 2;
+ } else if (codepoint < 0x10000) {
+ char low6 = (codepoint & low_6bits_mask);
+ low6 |= 0x80;
+
+ codepoint >>= 6;
+ char mid6 = (codepoint & low_6bits_mask);
+ mid6 |= 0x80;
+
+ codepoint >>= 6;
+ char first4 = codepoint;
+ first4 |= 0xE0;
+
+ u8buf[0] = first4;
+ u8buf[1] = mid6;
+ u8buf[2] = low6;
+ return 3;
+ } else if (codepoint <= 0x110000) { // Explicitly _include_ U+10FFFF + 1!
+ char low6 = (codepoint & low_6bits_mask);
+ low6 |= 0x80;
+
+ codepoint >>= 6;
+ char mid6 = (codepoint & low_6bits_mask);
+ mid6 |= 0x80;
+
+ codepoint >>= 6;
+ char hi6 = (codepoint & low_6bits_mask);
+ hi6 |= 0x80;
+
+ codepoint >>= 6;
+ char first3 = codepoint;
+ first3 |= 0xF0;
+
+ u8buf[0] = first3;
+ u8buf[1] = hi6;
+ u8buf[2] = mid6;
+ u8buf[3] = low6;
+ return 4;
+ } else {
+ throw_bad_code_point(codepoint);
+ }
+}
+
+} // anon ns
+
+// TODO optimize inlined in header for case where u32_char is < 0x80?
+void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char) {
+ unsigned char u8buf[4];
+ uint8_t u8bytes = encode_utf8_char(u32_char, u8buf);
+ out_str.append(reinterpret_cast<const char*>(u8buf), u8bytes);
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
new file mode 100644
index 00000000000..8627b01ff6a
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);
+
+std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
+
+/**
+ * Encodes a single UTF-32 codepoint `u32_char` to a 1-4 byte UTF-8 sequence and
+ * appends it to `out_str.`
+ *
+ * Note that this will happily encode code points that aren't technically part of
+ * the valid UTF-8 range, but which will still be correct in memcmp() byte-wise
+ * ordering, which is the API contract we expose.
+ *
+ * In particular, this includes:
+ * - high/low surrogate ranges U+D800 through U+DFFF (surrogate pairs not allowed
+ * in UTF-8)
+ * - U+10FFFF + 1 (outside max code point range by one)
+ *
+ * ... So don't copy this function for use as a general UTF-8 emitter, as it is not
+ * _technically_ conformant!
+ */
+void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char);
+
+}