table dfa

author: Håvard Pettersen <havardpe@yahooinc.com> 2023-09-18 15:04:26 +0000
committer: Håvard Pettersen <havardpe@yahooinc.com> 2023-09-25 14:54:38 +0000
commit: 985306887f246dafaeb0315ef3c3331b0e33e114 (patch)
tree: e03de7b856a4ee91e8e18dd786b0788b3da1c75b /vespalib
parent: 7facdd6177063f772c497000b9c12e4653a2db83 (diff)
10 files changed, 900 insertions, 8 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index c1d7e17b457..56dcd9abdf6 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -96,6 +96,7 @@ vespa_define_module(
     src/tests/fileheader
     src/tests/floatingpointtype
     src/tests/fuzzy
+    src/tests/fuzzy/table_dfa
     src/tests/gencnt
     src/tests/growablebytebuffer
     src/tests/guard
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index c235cb99509..02df6edc370 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -82,7 +82,8 @@ INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes,
                          Combine(Values(LevenshteinDfa::Casing::Uncased,
                                         LevenshteinDfa::Casing::Cased),
                                  Values(LevenshteinDfa::DfaType::Explicit,
-                                        LevenshteinDfa::DfaType::Implicit)),
+                                        LevenshteinDfa::DfaType::Implicit,
+                                        LevenshteinDfa::DfaType::Table)),
                          LevenshteinDfaTest::stringify_params);
 
 // Same as existing non-DFA Levenshtein tests, but with some added instantiations
@@ -233,7 +234,8 @@ struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> {
 INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
                          LevenshteinDfaCasingTest,
                          Values(LevenshteinDfa::DfaType::Explicit,
-                                LevenshteinDfa::DfaType::Implicit),
+                                LevenshteinDfa::DfaType::Implicit,
+                                LevenshteinDfa::DfaType::Table),
                          PrintToStringParamName());
 
 TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) {
@@ -315,7 +317,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
                          Combine(Values(LevenshteinDfa::Casing::Uncased,
                                         LevenshteinDfa::Casing::Cased),
                                  Values(LevenshteinDfa::DfaType::Explicit,
-                                        LevenshteinDfa::DfaType::Implicit),
+                                        LevenshteinDfa::DfaType::Implicit,
+                                        LevenshteinDfa::DfaType::Table),
                                  Values(1, 2)),
                          LevenshteinDfaSuccessorTest::stringify_params);
 
diff --git a/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt
new file mode 100644
index 00000000000..1017ac99564
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_fuzzy_table_dfa_test_app TEST
+        SOURCES
+        table_dfa_test.cpp
+        DEPENDS
+        vespalib
+        GTest::GTest
+        )
+vespa_add_test(NAME vespalib_fuzzy_table_dfa_test_app COMMAND vespalib_fuzzy_table_dfa_test_app)
diff --git a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
new file mode 100644
index 00000000000..7782a39c3c7
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
@@ -0,0 +1,298 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/table_dfa.hpp>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <set>
+
+using namespace ::testing;
+using namespace vespalib::fuzzy;
+
+// test/experiment with low-level concepts underlying the construction
+// of the tables used in the table-driven dfa implementation.
+
+TEST(TableDfaTest, position) {
+    Position pos1 = Position::start();
+    EXPECT_EQ(pos1.index, 0);
+    EXPECT_EQ(pos1.edits, 0);
+    Position pos2(2, 3);
+    EXPECT_EQ(pos2.index, 2);
+    EXPECT_EQ(pos2.edits, 3);
+}
+
+TEST(TableDfaTest, position_equality) {
+    Position pos1(0, 0);
+    Position pos2(0, 1);
+    Position pos3(1, 0);
+    EXPECT_TRUE(pos1 == pos1);
+    EXPECT_FALSE(pos1 == pos2);
+    EXPECT_FALSE(pos1 == pos2);
+}
+
+TEST(TableDfaTest, position_sort_order) {
+    std::vector<Position> list;
+    list.emplace_back(0,1);
+    list.emplace_back(0,0);
+    list.emplace_back(1,0);
+    list.emplace_back(1,1);
+    std::sort(list.begin(), list.end());
+    EXPECT_EQ(list[0].index, 0);
+    EXPECT_EQ(list[0].edits, 0);
+    EXPECT_EQ(list[1].index, 1);
+    EXPECT_EQ(list[1].edits, 0);
+    EXPECT_EQ(list[2].index, 0);
+    EXPECT_EQ(list[2].edits, 1);
+    EXPECT_EQ(list[3].index, 1);
+    EXPECT_EQ(list[3].edits, 1);
+}
+
+TEST(TableDfaTest, position_subsumption) {
+    Position pos1(0, 0);
+    Position pos2(0, 1);
+    Position pos3(0, 2);
+
+    Position pos4(1, 0);
+    Position pos5(1, 1);
+    Position pos6(1, 2);
+    
+    Position pos7(2, 0);
+    Position pos8(2, 1);
+    Position pos9(2, 2);
+    
+    EXPECT_FALSE(pos1.subsumes(pos1));
+    EXPECT_TRUE(pos1.subsumes(pos2));
+    EXPECT_TRUE(pos1.subsumes(pos3));
+    EXPECT_FALSE(pos1.subsumes(pos4));
+    EXPECT_TRUE(pos1.subsumes(pos5));
+    EXPECT_TRUE(pos1.subsumes(pos6));
+    EXPECT_FALSE(pos1.subsumes(pos7));
+    EXPECT_FALSE(pos1.subsumes(pos8));
+    EXPECT_TRUE(pos1.subsumes(pos9));
+
+    EXPECT_FALSE(pos5.subsumes(pos1));
+    EXPECT_FALSE(pos5.subsumes(pos2));
+    EXPECT_TRUE(pos5.subsumes(pos3));
+    EXPECT_FALSE(pos5.subsumes(pos4));
+    EXPECT_FALSE(pos5.subsumes(pos5));
+    EXPECT_TRUE(pos5.subsumes(pos6));
+    EXPECT_FALSE(pos5.subsumes(pos7));
+    EXPECT_FALSE(pos5.subsumes(pos8));
+    EXPECT_TRUE(pos5.subsumes(pos9));
+}
+
+TEST(TableDfaTest, position_materialization) {
+    EXPECT_EQ(Position(1,1).materialize(0).index, 0);
+    EXPECT_EQ(Position(1,1).materialize(1).index, 1);
+    EXPECT_EQ(Position(1,1).materialize(2).index, 2);
+    EXPECT_EQ(Position(1,1).materialize(0).edits, 2);
+    EXPECT_EQ(Position(1,1).materialize(1).edits, 1);
+    EXPECT_EQ(Position(1,1).materialize(2).edits, 2);
+}
+
+TEST(TableDfaTest, position_to_string) {
+    Position pos1(0, 0);
+    Position pos2(1, 2);
+    Position pos3(2, 3);
+    EXPECT_EQ(pos1.to_string(), fmt("0#0"));
+    EXPECT_EQ(pos2.to_string(), fmt("1#2"));
+    EXPECT_EQ(pos3.to_string(), fmt("2#3"));
+}
+
+TEST(TableDfaTest, state_creation_reorder) {
+    EXPECT_EQ(State::create<5>({{0,1},{2,0}}).to_string(), fmt("{2#0,0#1}"));
+    EXPECT_EQ(State::create<5>({{2,0},{0,0}}).to_string(), fmt("{0#0,2#0}"));
+}
+
+TEST(TableDfaTest, state_creation_duplicate_removal) {
+    EXPECT_EQ(State::create<5>({{0,0},{0,0},{2,1},{2,1}}).to_string(), fmt("{0#0,2#1}"));
+}
+
+TEST(TableDfaTest, state_creation_edit_cutoff) {
+    EXPECT_EQ(State::create<2>({{0,0},{5,2},{10,3}}).to_string(), fmt("{0#0,5#2}"));
+}
+
+TEST(TableDfaTest, state_creation_subsumption_collapsing) {
+    EXPECT_EQ(State::create<2>({{0,0},{1,1}}).to_string(), fmt("{0#0}"));
+    EXPECT_EQ(State::create<2>({{0,1},{1,0}}).to_string(), fmt("{1#0}"));
+    EXPECT_EQ(State::create<2>({{0,0},{2,2}}).to_string(), fmt("{0#0}"));
+    EXPECT_EQ(State::create<2>({{0,2},{2,0}}).to_string(), fmt("{2#0}"));
+}
+
+TEST(TableDfaTest, state_normalization) {
+    auto state1 = State::create<2>({{2,1},{3,1}});
+    auto state2 = State::create<2>({{5,0},{3,1}});
+    EXPECT_EQ(state1.to_string(), fmt("{2#1,3#1}"));
+    EXPECT_EQ(state2.to_string(), fmt("{5#0,3#1}"));
+    EXPECT_EQ(state1.normalize(), 2);
+    EXPECT_EQ(state2.normalize(), 3);
+    EXPECT_EQ(state1.to_string(), fmt("{0#1,1#1}"));
+    EXPECT_EQ(state2.to_string(), fmt("{2#0,0#1}"));
+}
+
+TEST(TableDfaTest, state_repo) {
+    StateRepo repo;
+    EXPECT_EQ(repo.state_to_idx(State::failed()), 0);
+    EXPECT_EQ(repo.state_to_idx(State::start()), 1);
+    EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2);
+    EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3);
+    EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2);
+    EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3);
+    EXPECT_EQ(repo.size(), 4);
+    EXPECT_EQ(repo.idx_to_state(0).to_string(), fmt("{}"));
+    EXPECT_EQ(repo.idx_to_state(1).to_string(), fmt("{0#0}"));
+    EXPECT_EQ(repo.idx_to_state(2).to_string(), fmt("{0#0,1#0}"));
+    EXPECT_EQ(repo.idx_to_state(3).to_string(), fmt("{0#0,2#1}"));
+}
+
+TEST(TableDfaTest, expand_bits) {
+    auto yes = expand_bits<2>(0x1f);
+    auto no = expand_bits<2>(0x00);
+    auto odd = expand_bits<2>(0x0a);
+    auto even = expand_bits<2>(0x15);
+    ASSERT_EQ(yes.size(), 5);
+    ASSERT_EQ(no.size(), 5);
+    ASSERT_EQ(odd.size(), 5);
+    ASSERT_EQ(even.size(), 5);
+    for (size_t i = 0; i < 5; ++i) {
+        EXPECT_TRUE(yes[i]);
+        EXPECT_FALSE(no[i]);
+        EXPECT_EQ(odd[i], bool(i % 2 == 1));
+        EXPECT_EQ(even[i], bool(i % 2 == 0));
+    }
+}
+
+TEST(TableDfaTest, format_bits) {
+    EXPECT_EQ(format_vector(expand_bits<1>(0)), fmt("[0,0,0]"));
+    EXPECT_EQ(format_vector(expand_bits<1>(7)), fmt("[1,1,1]"));
+    EXPECT_EQ(format_vector(expand_bits<1>(5)), fmt("[1,0,1]"));
+    EXPECT_EQ(format_vector(expand_bits<1>(2)), fmt("[0,1,0]"));
+    EXPECT_EQ(format_vector(expand_bits<2>(31)), fmt("[1,1,1,1,1]"));
+    EXPECT_EQ(format_vector(expand_bits<2>(21)), fmt("[1,0,1,0,1]"));
+    EXPECT_EQ(format_vector(expand_bits<2>(31), true), fmt("11111"));
+    EXPECT_EQ(format_vector(expand_bits<2>(21), true), fmt("10101"));
+}
+
+template <uint8_t N>
+void list_states(bool count_only = false) {
+    auto repo = make_state_repo<N>();
+    EXPECT_EQ(num_states<N>(), repo.size());
+    fprintf(stderr, "max_edits: %u, number of states: %zu\n", N, repo.size());
+    if (!count_only) {
+        for (uint32_t i = 0; i < repo.size(); ++i) {
+            fprintf(stderr, "  state %u: %s\n", i, repo.idx_to_state(i).to_string().c_str());
+        }
+    }
+}
+
+TEST(TableDfaTest, list_states_for_max_edits_1) { list_states<1>(); }
+TEST(TableDfaTest, list_states_for_max_edits_2) { list_states<2>(); }
+TEST(TableDfaTest, count_states_for_max_edits_3) { list_states<3>(true); }
+
+template <uint8_t N>
+void list_edits() {
+    auto repo = make_state_repo<N>();
+    fprintf(stderr,
+            "per state, listing the minimal number of edits needed\n"
+            "to reach offsets at and beyond its minimal boundary\n");
+    for (uint32_t i = 0; i < repo.size(); ++i) {
+        const State &state = repo.idx_to_state(i);
+        fprintf(stderr, "%-23s : %s\n", state.to_string().c_str(),
+                format_vector(state.make_edit_vector<N>()).c_str());
+    }
+}
+
+TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_1) { list_edits<1>(); }
+TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_2) { list_edits<2>(); }
+
+template <uint8_t N>
+void list_transitions() {
+    auto repo = make_state_repo<N>();
+    for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+        const State &state = repo.idx_to_state(idx);
+        for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+            auto bits = expand_bits<N>(i);
+            State new_state = state.next<N>(bits);
+            uint32_t step = new_state.normalize();
+            uint32_t new_idx = repo.state_to_idx(new_state);
+            ASSERT_LT(new_idx, repo.size());
+            fprintf(stderr, "%u:%s,i --%s--> %u:%s,%s\n", idx, state.to_string().c_str(),
+                    format_vector(bits).c_str(), new_idx, new_state.to_string().c_str(),
+                    (step == 0) ? "i" : fmt("i+%u", step).c_str());
+        }
+    }
+}
+
+TEST(TableDfaTest, list_transitions_for_max_edits_1) { list_transitions<1>(); }
+
+// Simulate all possible ways we can approach the end of the word we
+// are matching. Verify that no transition taken can produce a state
+// with a minimal boundary that exceeds the boundary of the word
+// itself. Verifying this will enable us to not care about word size
+// while simulating the dfa.
+template <uint8_t N>
+void verify_word_end_boundary() {
+    auto repo = make_state_repo<N>();
+    using StateSet = std::set<uint32_t>;
+    std::vector<StateSet> active(window_size<N>() + 1);
+    for (size_t i = 1; i < repo.size(); ++i) {
+        active[0].insert(i);
+    }
+    EXPECT_EQ(active.size(), window_size<N>() + 1);
+    EXPECT_EQ(active[0].size(), repo.size() - 1);
+    fprintf(stderr, "verifying word end for max edits %u\n", N);
+    uint32_t edge_shape = 0;
+    for (uint32_t active_idx = 0; active_idx < active.size(); ++active_idx) {
+        fprintf(stderr, "  edge shape: %s, max step: %zu, active_states: %zu\n",
+                format_vector(expand_bits<N>(edge_shape)).c_str(), active.size() - active_idx - 1, active[active_idx].size());
+        for (uint32_t idx: active[active_idx]) {
+            const State &state = repo.idx_to_state(idx);
+            for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+                if ((i & edge_shape) == 0) {
+                    State new_state = state.next<N>(expand_bits<N>(i));
+                    uint32_t step = new_state.normalize();
+                    uint32_t new_idx = repo.state_to_idx(new_state);
+                    ASSERT_LT(new_idx, repo.size());
+                    if (new_idx != 0) {
+                        ASSERT_GT(active.size(), active_idx + step);
+                        active[active_idx + step].insert(new_idx);
+                    }
+                }
+            }
+        }
+        edge_shape = (edge_shape << 1) + 1;
+    }
+    EXPECT_EQ(edge_shape, (1 << (window_size<N>() + 1)) - 1);
+    while (!active.back().empty()) {
+        fprintf(stderr, "  residue states after word end: %zu\n", active.back().size());
+        StateSet residue;
+        for (uint32_t idx: active.back()) {
+            const State &state = repo.idx_to_state(idx);
+            State new_state = state.next<N>(expand_bits<N>(0));
+            uint32_t step = new_state.normalize();
+            uint32_t new_idx = repo.state_to_idx(new_state);
+            ASSERT_LT(new_idx, repo.size());
+            ASSERT_EQ(step, 0);
+            if (new_idx != 0) {
+                residue.insert(new_idx);
+            }
+        }
+        active.back() = std::move(residue);
+    }
+}
+
+TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_1) {
+    verify_word_end_boundary<1>();
+}
+
+TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_2) {
+    verify_word_end_boundary<2>();
+}
+
+TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) {
+    auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table);
+    std::ostringstream out;
+    dfa.dump_as_graphviz(out);
+    fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage());
+    fprintf(stderr, "%s", out.str().c_str());
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
index 5e8d29980cd..8ccef84d969 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(vespalib_vespalib_fuzzy OBJECT
         implicit_levenshtein_dfa.cpp
         levenshtein_dfa.cpp
         levenshtein_distance.cpp
+        table_dfa.cpp
         unicode_utils.cpp
     DEPENDS
 )
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 1caae408176..5f6d0ae9956 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "explicit_levenshtein_dfa.h"
 #include "implicit_levenshtein_dfa.h"
+#include "table_dfa.h"
 #include "levenshtein_dfa.h"
 #include "unicode_utils.h"
 #include <vespa/vespalib/util/stringfmt.h>
@@ -53,14 +54,19 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max
         } else { // max_edits == 2
             return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
         }
-    } else { // DfaType::Explicit
+    } else if(dfa_type == DfaType::Explicit) {
         if (max_edits == 1) {
             return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
         } else { // max_edits == 2
             return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
         }
+    } else { // DfaType::Table
+        if (max_edits == 1) {
+            return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased));
+        } else { // max_edits == 2
+            return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased));
+        }
     }
-
 }
 
 LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
@@ -87,9 +93,11 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo
 std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) {
     if (dt == LevenshteinDfa::DfaType::Implicit) {
         os << "Implicit";
-    } else {
-        assert(dt == LevenshteinDfa::DfaType::Explicit);
+    } else if (dt == LevenshteinDfa::DfaType::Explicit) {
         os << "Explicit";
+    } else {
+        assert(dt == LevenshteinDfa::DfaType::Table);
+        os << "Table";
     }
     return os;
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index c6ca06d4de3..88261bef62c 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -231,7 +231,8 @@ public:
 
     enum class DfaType {
         Implicit,
-        Explicit
+        Explicit,
+        Table
     };
 
     /**
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp
new file mode 100644
index 00000000000..943349818fb
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp
@@ -0,0 +1,10 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "table_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class TableDfa<1>;
+template class TableDfa<2>;
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
new file mode 100644
index 00000000000..dc511a42d04
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
@@ -0,0 +1,64 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "levenshtein_dfa.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+/**
+ * This implementation is based on the paper 'Fast string correction
+ * with Levenshtein automata' from 2002 by Klaus U. Schulz and Stoyan
+ * Mihov.
+ *
+ * Given the maximal distance N, a generic parameterized transition
+ * table is calculated up-front. When a specific word is given, a
+ * simple lookup structure is created to enumerate the possible
+ * characteristic vectors for each position in the given
+ * word. Together, these structures can be used to simulate the
+ * traversal of a hypothetical Levenshtein dfa that will never be
+ * created.
+ *
+ * Approaching the end of the word is handled by padding the
+ * characteristic vectors with 0 bits for everything after the word
+ * ends. In addition, a unit test verifies that there is no possible
+ * sequence of events that leads to the minimal boundary of the state
+ * exceeding the boundary of the word itself. This means that the
+ * simulated dfa can be stepped freely without checking for word size.
+ **/
+template <uint8_t N>
+class TableDfa final : public LevenshteinDfa::Impl
+{
+public:
+    // characteristic vector for a specific input value indicating how
+    // it matches the window starting at the minimal boundary.
+    struct CV {
+        uint32_t input;
+        uint32_t match;
+        CV() noexcept : input(0), match(0) {}
+    };
+    static constexpr size_t window_size() { return 2 * N + 1; }
+    struct Lookup {
+        std::array<CV, window_size()> list;
+        Lookup() noexcept : list() {}
+    };
+    
+private:
+    const void               *_tfa;
+    const std::vector<Lookup> _lookup;
+    const bool                _is_cased;
+
+    static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str);
+    
+public:
+    using MatchResult = LevenshteinDfa::MatchResult;
+    TableDfa(std::vector<uint32_t> str, bool is_cased);
+    ~TableDfa() override;
+    [[nodiscard]] MatchResult match(std::string_view source) const override;
+    [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override;
+    [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const override;
+    [[nodiscard]] size_t memory_usage() const noexcept override;
+    void dump_as_graphviz(std::ostream& os) const override;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
new file mode 100644
index 00000000000..0693343007e
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
@@ -0,0 +1,497 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "table_dfa.h"
+#include "match_algorithm.hpp"
+#include <vespa/vespalib/util/stringfmt.h>
+#include <cassert>
+#include <stdexcept>
+#include <algorithm>
+#include <map>
+
+namespace vespalib::fuzzy {
+
+namespace {
+
+using vespalib::make_string_short::fmt;
+
+// It is useful to know the number of states compile time to be able
+// to pack lookup tables better.
+template <uint8_t N> constexpr size_t num_states();
+template <> constexpr size_t num_states<1>() { return 6; }
+template <> constexpr size_t num_states<2>() { return 31; }
+template <> constexpr size_t num_states<3>() { return 197; }
+
+template <uint8_t N> constexpr size_t window_size() { return 2 * N + 1; }
+template <uint8_t N> constexpr size_t num_transitions() { return 1 << window_size<N>(); }
+
+
+auto diff(auto a, auto b) { return (a > b) ? (a - b) : (b - a); }
+
+// A Position combines an index into a word being matched with the
+// number of edits needed to get there. This maps directly onto a
+// specific state in the NFA used to match a word. Note that the sort
+// order prefers low edits over low indexs. This is to ensure that a
+// position that subsumes another position will always sort before it.
+struct Position {
+    uint32_t index;
+    uint32_t edits;
+    Position(uint32_t index_in, uint32_t edits_in) noexcept
+      : index(index_in), edits(edits_in) {}
+    static Position start() noexcept { return Position(0,0); }
+    bool subsumes(const Position &rhs) const noexcept {
+        if (edits >= rhs.edits) {
+            return false;
+        }
+        return diff(index, rhs.index) <= (rhs.edits - edits);
+    }
+    Position materialize(uint32_t target_index) const noexcept {
+        return Position(target_index, edits + diff(index, target_index));
+    }
+    bool operator==(const Position &rhs) const noexcept {
+        return (index == rhs.index) && (edits == rhs.edits);
+    }
+    bool operator<(const Position &rhs) const noexcept {
+        return std::tie(edits,index) < std::tie(rhs.edits, rhs.index);
+    }
+    template <uint8_t N>
+    void add_elementary_transitions(const std::vector<bool> &bits, std::vector<Position> &dst) const {
+        assert(bits.size() > index);
+        if (!bits[index]) {
+            dst.emplace_back(index, edits + 1);
+            dst.emplace_back(index + 1, edits + 1);
+        }
+        for (uint32_t e = 0; (edits + e) <= N; ++e) {
+            assert(bits.size() > (index + e));
+            if (bits[index + e]) {
+                dst.emplace_back(index + e + 1, edits + e);
+            }
+        }
+    }
+    vespalib::string to_string() const { return fmt("%u#%u", index, edits); }
+};
+
+// A State is a collection of different Positions that do not subsume
+// each other. If the minimal boundary of a state is larger than 0, it
+// can be lifted from the state in a normalizing operation that will
+// renumber the position indexes such that the minimal boundary of the
+// state becomes 0. This is to allow parameterized states where the
+// general progress of matching the string (minimal boundary of
+// non-normalized state) is untangled from the local competing edit
+// alternatives (normalized state).
+struct State {
+    std::vector<Position> list;
+    State() noexcept : list() {}
+    static State failed() noexcept { return State(); }
+    static State start() {
+        State result;
+        result.list.push_back(Position::start());
+        return result;
+    }
+    bool operator<(const State &rhs) const {
+        return list < rhs.list;
+    }
+    uint32_t minimal_boundary() const noexcept {
+        if (list.empty()) {
+            return 0;
+        }
+        uint32_t min = list[0].index;
+        for (size_t i = 1; i < list.size(); ++i) {
+            min = std::min(min, list[i].index);
+        }
+        return min;
+    }
+    uint32_t normalize() {
+        uint32_t min = minimal_boundary();
+        if (min > 0) {
+            for (auto &entry: list) {
+                entry.index -= min;
+            }
+        }
+        return min;
+    }
+    template <uint8_t N>
+    static State create(std::vector<Position> list_in) {
+        State result;
+        auto want = [&result](Position pos) {
+                        if (pos.edits > N) {
+                            return false;
+                        }
+                        for (const auto &old_pos: result.list) {
+                            if (old_pos == pos || old_pos.subsumes(pos)) {
+                                return false;
+                            }
+                        }
+                        return true;
+                    };
+        std::sort(list_in.begin(), list_in.end());
+        for (const auto &pos: list_in) {
+            if (want(pos)) {
+                result.list.push_back(pos);
+            }
+        }
+        return result;
+    }
+    template <uint8_t N>
+    State next(const std::vector<bool> &bits) const {
+        std::vector<Position> tmp;
+        for (const auto &pos: list) {
+            pos.add_elementary_transitions<N>(bits, tmp);
+        }
+        return create<N>(std::move(tmp));
+    }
+    template <uint8_t N>
+    std::vector<uint8_t> make_edit_vector() const {
+        std::vector<uint8_t> result(window_size<N>(), N + 1);
+        for (const auto &pos: list) {
+            for (uint32_t i = 0; i < window_size<N>(); ++i) {
+                result[i] = std::min(result[i], uint8_t(pos.materialize(i).edits));
+            }
+        }
+        return result;
+    }
+    vespalib::string to_string() const {
+        vespalib::string result = "{";
+        for (size_t i = 0; i < list.size(); ++i) {
+            if (i > 0) {
+                result.append(",");
+            }
+            result.append(list[i].to_string());
+        }
+        result.append("}");
+        return result;
+    }
+};
+
+// Keeps track of unique states, assigning an integer value to each
+// state. Only states with minimal boundary 0 is allowed to be
+// inserted into a state repo. Each repo is seeded with the empty
+// state (0) and the start state (1). An assigned integer value can be
+// mapped back into the originating state.
+struct StateRepo {
+    using Map = std::map<State,uint32_t>;
+    using Ref = Map::iterator;
+    Map seen;
+    std::vector<Ref> refs;
+    StateRepo() noexcept : seen(), refs() {
+        auto failed_idx = state_to_idx(State::failed());
+        auto start_idx = state_to_idx(State::start());
+        assert(failed_idx == 0);
+        assert(start_idx == 1);
+    }
+    ~StateRepo();
+    size_t size() const { return seen.size(); }
+    uint32_t state_to_idx(const State &state) {
+        assert(state.minimal_boundary() == 0);
+        uint32_t next = refs.size();
+        auto [pos, inserted] = seen.emplace(state, next);
+        if (inserted) {
+            refs.push_back(pos);
+        }
+        assert(seen.size() == refs.size());
+        return pos->second;
+    }
+    const State &idx_to_state(uint32_t idx) const {
+        assert(idx < refs.size());
+        return refs[idx]->first;
+    }
+};
+StateRepo::~StateRepo() = default;
+
+template <uint8_t N>
+std::vector<bool> expand_bits(uint32_t value) {
+    static_assert(N < 10);
+    std::vector<bool> result(window_size<N>());
+    uint32_t look_for = num_transitions<N>();
+    assert(value < look_for);
+    for (size_t i = 0; i < result.size(); ++i) {
+        look_for >>= 1;
+        result[i] = (value & look_for);
+    }
+    return result;
+}
+
+template <uint8_t N>
+[[maybe_unused]] StateRepo make_state_repo() {
+    StateRepo repo;
+    for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+        const State &state = repo.idx_to_state(idx);
+        for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+            State new_state = state.next<N>(expand_bits<N>(i));
+            (void) new_state.normalize();
+            (void) repo.state_to_idx(new_state);
+        }
+    }
+    return repo;
+}
+
+// this is the result of our efforts
+template <uint8_t N>
+struct Tfa {
+    struct Entry {
+        uint8_t step;
+        uint8_t state;
+    };
+
+    // what happens when following a transition from a state?
+    std::array<std::array<Entry,num_transitions<N>()>,num_states<N>()> table;
+
+    // how many edits did we use to match the target word?
+    std::array<std::array<uint8_t,window_size<N>()>,num_states<N>()> edits;
+};
+
+template <uint8_t N>
+std::unique_ptr<Tfa<N>> make_tfa() {
+    auto tfa = std::make_unique<Tfa<N>>();
+    StateRepo repo;
+    uint32_t state_idx = 0;
+    for (; state_idx < repo.size(); ++state_idx) {
+        const State &state = repo.idx_to_state(state_idx);
+        for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+            State new_state = state.next<N>(expand_bits<N>(i));
+            uint32_t step = new_state.normalize();
+            uint32_t new_state_idx = repo.state_to_idx(new_state);
+            assert(step < 256);
+            assert(new_state_idx < 256);
+            tfa->table[state_idx][i].step = step;
+            tfa->table[state_idx][i].state = new_state_idx;
+        }
+        auto edits = state.make_edit_vector<N>();
+        assert(edits.size() == window_size<N>());
+        for (uint32_t i = 0; i < window_size<N>(); ++i) {
+            tfa->edits[state_idx][i] = edits[i];
+        }
+    }
+    assert(repo.size() == num_states<N>());
+    assert(state_idx == num_states<N>());
+    return tfa;
+}
+
+template <uint8_t N>
+const Tfa<N> *get_tfa() {
+    static std::unique_ptr<Tfa<N>> tfa = make_tfa<N>();
+    return tfa.get();
+}
+
+template <typename T>
+vespalib::string format_vector(const std::vector<T> &vector, bool compact = false) {
+    vespalib::string str = compact ? "" : "[";
+    for (size_t i = 0; i < vector.size(); ++i) {
+        if (i > 0 && !compact) {
+            str.append(",");
+        }
+        str.append(fmt("%u", uint32_t(vector[i])));
+    }
+    if (!compact) {
+        str.append("]");
+    }
+    return str;
+}
+
+template <uint8_t N>
+struct TableMatcher {
+    struct S {
+        uint32_t index;
+        uint32_t state;
+        // needed by dfa matcher concept (should use std::declval instead)
+        constexpr S() noexcept : index(0), state(0) {}
+        constexpr S(uint32_t i, uint32_t s) noexcept : index(i), state(s) {}
+        S next(const Tfa<N> *tfa, uint32_t bits) noexcept {
+            auto entry = tfa->table[state][bits];
+            return S(index + entry.step, entry.state);
+        }
+        constexpr bool is_valid_edge(const Tfa<N> *tfa, uint32_t bits) const noexcept {
+            return tfa->table[state][bits].state != 0;
+        }
+    };
+    using StateType = S;
+    using StateParamType = StateType;
+    using EdgeType = uint32_t;
+
+    const Tfa<N>              *tfa;
+    const TableDfa<N>::Lookup *lookup;
+    const uint32_t             end;
+    const bool                 cased;
+
+    TableMatcher(const Tfa<N> *tfa_in, const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in)
+      noexcept : tfa(tfa_in), lookup(lookup_in), end(end_in), cased(cased_in) {}
+    
+    bool is_cased() const noexcept { return cased; }
+    static constexpr S start() noexcept { return S(0, 1); }
+
+    uint8_t match_edit_distance(S s) const noexcept {
+        uint32_t leap = (end - s.index);
+        return (leap < window_size<N>()) ? tfa->edits[s.state][leap] : N + 1;
+    }
+    bool is_match(S s) const noexcept { return match_edit_distance(s) <= N; }
+
+    static constexpr bool can_match(S s) noexcept { return (s.state != 0); }
+    static constexpr bool valid_state(S s) noexcept { return (s.state != 0); }
+
+    S match_wildcard(S s) const noexcept { return s.next(tfa, 0); }
+    S match_input(S s, uint32_t c) const noexcept {
+        const auto *slice = lookup[s.index].list.data();
+        for (size_t i = 0; i < window_size<N>() && slice[i].input != 0; ++i) {
+            if (slice[i].input == c) {
+                return s.next(tfa, slice[i].match);
+            }
+        }
+        return match_wildcard(s);
+    }
+
+    bool has_higher_out_edge(S s, uint32_t c) const noexcept {
+        if (s.is_valid_edge(tfa, 0)) {
+            return true;
+        }
+        const auto *slice = lookup[s.index].list.data();
+        for (size_t i = 0; i < window_size<N>() && slice[i].input > c; ++i) {
+            if (s.is_valid_edge(tfa, slice[i].match)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    bool has_exact_explicit_out_edge(S s, uint32_t c) const noexcept {
+        const auto *slice = lookup[s.index].list.data();
+        for (size_t i = 0; i < window_size<N>() && slice[i].input >= c; ++i) {
+            if (slice[i].input == c) {
+                return s.is_valid_edge(tfa, slice[i].match);
+            }
+        }
+        return false;
+    }
+
+    uint32_t lowest_higher_explicit_out_edge(S s, uint32_t c) const noexcept {        
+        const auto *slice = lookup[s.index].list.data();
+        size_t i = window_size<N>();
+        while (i-- > 0) {
+            if (slice[i].input > c && s.is_valid_edge(tfa, slice[i].match)) {
+                return slice[i].input;
+            }
+        }
+        return 0;
+    }
+    
+    uint32_t smallest_explicit_out_edge(S s) const noexcept {
+        const auto *slice = lookup[s.index].list.data();
+        size_t i = window_size<N>();
+        while (i-- > 0) {
+            if (slice[i].input != 0 && s.is_valid_edge(tfa, slice[i].match)) {
+                return slice[i].input;
+            }
+        }
+        return 0;
+    }
+    
+    static constexpr bool valid_edge(uint32_t c) noexcept { return c != 0; }
+    static constexpr uint32_t edge_to_u32char(uint32_t c) noexcept { return c; }
+    S edge_to_state(S s, uint32_t c) const noexcept { return match_input(s, c); }
+
+    static constexpr bool implies_exact_match_suffix(S) noexcept { return false; }
+    static constexpr void emit_exact_match_suffix(S, std::vector<uint32_t> &) {} // not called
+    static constexpr void emit_exact_match_suffix(S, std::string &) {} // not called
+};
+
+} // unnamed
+
+template <uint8_t N>
+auto
+TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup>
+{
+    std::vector<Lookup> result(str.size() + 1);
+    auto have_already = [&](uint32_t c, size_t i)noexcept{
+                            for (size_t j = 0; j < window_size(); ++j) {
+                                if (result[i].list[j].input == c) {
+                                    return true;
+                                }
+                            }
+                            return false;
+                        };
+    auto make_vector = [&](uint32_t c, size_t i)noexcept{
+                           uint32_t bits = 0;
+                           for (size_t j = 0; j < window_size(); ++j) {
+                               bool found = ((i + j) < str.size()) && (str[i+j] == c);
+                               bits = (bits << 1) + found;
+                           }
+                           return bits;
+                       };
+    for (size_t i = 0; i < str.size(); ++i) {
+        for (size_t j = 0; j < window_size(); ++j) {
+            assert(result[i].list[j].input == 0);
+            assert(result[i].list[j].match == 0);
+            if ((i + j) < str.size()) {
+                uint32_t c = str[i + j];
+                if (!have_already(c, i)) {
+                    result[i].list[j].input = c;
+                    result[i].list[j].match = make_vector(c, i);
+                }
+            }
+        }
+        std::sort(result[i].list.begin(), result[i].list.end(),
+                  [](const auto &a, const auto &b){ return a.input > b.input; });
+    }
+    return result;
+}
+
+template <uint8_t N>
+TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased)
+  : _tfa(get_tfa<N>()),
+    _lookup(make_lookup(str)),
+    _is_cased(is_cased)
+{
+}
+
+template <uint8_t N>
+TableDfa<N>::~TableDfa() = default;
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str) const
+{
+    TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased);
+    return MatchAlgorithm<N>::match(matcher, u8str);
+}
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const
+{
+    TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased);
+    return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const
+{
+    TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased);
+    return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t N>
+size_t
+TableDfa<N>::memory_usage() const noexcept
+{
+    return _lookup.size() * sizeof(Lookup);
+}
+
+template <uint8_t N>
+void
+TableDfa<N>::dump_as_graphviz(std::ostream &os) const
+{
+    os << std::dec << "digraph table_dfa {\n";
+    for (size_t i = 0; i < _lookup.size(); ++i) {
+        for (size_t j = 0; j < window_size(); ++j) {
+            if (_lookup[i].list[j].input != 0) {
+                std::string as_utf8;
+                append_utf32_char(as_utf8, _lookup[i].list[j].input);
+                os << "    x" << i << " -> " << _lookup[i].list[j].match << " [label=\"" << as_utf8 << "\"];\n";
+            }
+        }
+        os << "    x" << i << " -> 0 [label=\"*\"];\n";
+    }
+    os << "}\n";
+}
+
+}
author	Håvard Pettersen <havardpe@yahooinc.com>	2023-09-18 15:04:26 +0000
committer	Håvard Pettersen <havardpe@yahooinc.com>	2023-09-25 14:54:38 +0000
commit	985306887f246dafaeb0315ef3c3331b0e33e114 (patch)
tree	e03de7b856a4ee91e8e18dd786b0788b3da1c75b /vespalib
parent	7facdd6177063f772c497000b9c12e4653a2db83 (diff)