aboutsummaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@yahooinc.com>2023-09-18 15:04:26 +0000
committerHåvard Pettersen <havardpe@yahooinc.com>2023-09-25 14:54:38 +0000
commit985306887f246dafaeb0315ef3c3331b0e33e114 (patch)
treee03de7b856a4ee91e8e18dd786b0788b3da1c75b /vespalib
parent7facdd6177063f772c497000b9c12e4653a2db83 (diff)
table dfa
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp9
-rw-r--r--vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt9
-rw-r--r--vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp298
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt1
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp16
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h3
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp10
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.h64
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp497
10 files changed, 900 insertions, 8 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index c1d7e17b457..56dcd9abdf6 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -96,6 +96,7 @@ vespa_define_module(
src/tests/fileheader
src/tests/floatingpointtype
src/tests/fuzzy
+ src/tests/fuzzy/table_dfa
src/tests/gencnt
src/tests/growablebytebuffer
src/tests/guard
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index c235cb99509..02df6edc370 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -82,7 +82,8 @@ INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes,
Combine(Values(LevenshteinDfa::Casing::Uncased,
LevenshteinDfa::Casing::Cased),
Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit)),
+ LevenshteinDfa::DfaType::Implicit,
+ LevenshteinDfa::DfaType::Table)),
LevenshteinDfaTest::stringify_params);
// Same as existing non-DFA Levenshtein tests, but with some added instantiations
@@ -233,7 +234,8 @@ struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> {
INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
LevenshteinDfaCasingTest,
Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit),
+ LevenshteinDfa::DfaType::Implicit,
+ LevenshteinDfa::DfaType::Table),
PrintToStringParamName());
TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) {
@@ -315,7 +317,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
Combine(Values(LevenshteinDfa::Casing::Uncased,
LevenshteinDfa::Casing::Cased),
Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit),
+ LevenshteinDfa::DfaType::Implicit,
+ LevenshteinDfa::DfaType::Table),
Values(1, 2)),
LevenshteinDfaSuccessorTest::stringify_params);
diff --git a/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt
new file mode 100644
index 00000000000..1017ac99564
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_fuzzy_table_dfa_test_app TEST
+ SOURCES
+ table_dfa_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+ )
+vespa_add_test(NAME vespalib_fuzzy_table_dfa_test_app COMMAND vespalib_fuzzy_table_dfa_test_app)
diff --git a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
new file mode 100644
index 00000000000..7782a39c3c7
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
@@ -0,0 +1,298 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/table_dfa.hpp>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <set>
+
+using namespace ::testing;
+using namespace vespalib::fuzzy;
+
+// test/experiment with low-level concepts underlying the construction
+// of the tables used in the table-driven dfa implementation.
+
+TEST(TableDfaTest, position) {
+ Position pos1 = Position::start();
+ EXPECT_EQ(pos1.index, 0);
+ EXPECT_EQ(pos1.edits, 0);
+ Position pos2(2, 3);
+ EXPECT_EQ(pos2.index, 2);
+ EXPECT_EQ(pos2.edits, 3);
+}
+
+TEST(TableDfaTest, position_equality) {
+ Position pos1(0, 0);
+ Position pos2(0, 1);
+ Position pos3(1, 0);
+ EXPECT_TRUE(pos1 == pos1);
+ EXPECT_FALSE(pos1 == pos2);
+ EXPECT_FALSE(pos1 == pos2);
+}
+
+TEST(TableDfaTest, position_sort_order) {
+ std::vector<Position> list;
+ list.emplace_back(0,1);
+ list.emplace_back(0,0);
+ list.emplace_back(1,0);
+ list.emplace_back(1,1);
+ std::sort(list.begin(), list.end());
+ EXPECT_EQ(list[0].index, 0);
+ EXPECT_EQ(list[0].edits, 0);
+ EXPECT_EQ(list[1].index, 1);
+ EXPECT_EQ(list[1].edits, 0);
+ EXPECT_EQ(list[2].index, 0);
+ EXPECT_EQ(list[2].edits, 1);
+ EXPECT_EQ(list[3].index, 1);
+ EXPECT_EQ(list[3].edits, 1);
+}
+
+TEST(TableDfaTest, position_subsumption) {
+ Position pos1(0, 0);
+ Position pos2(0, 1);
+ Position pos3(0, 2);
+
+ Position pos4(1, 0);
+ Position pos5(1, 1);
+ Position pos6(1, 2);
+
+ Position pos7(2, 0);
+ Position pos8(2, 1);
+ Position pos9(2, 2);
+
+ EXPECT_FALSE(pos1.subsumes(pos1));
+ EXPECT_TRUE(pos1.subsumes(pos2));
+ EXPECT_TRUE(pos1.subsumes(pos3));
+ EXPECT_FALSE(pos1.subsumes(pos4));
+ EXPECT_TRUE(pos1.subsumes(pos5));
+ EXPECT_TRUE(pos1.subsumes(pos6));
+ EXPECT_FALSE(pos1.subsumes(pos7));
+ EXPECT_FALSE(pos1.subsumes(pos8));
+ EXPECT_TRUE(pos1.subsumes(pos9));
+
+ EXPECT_FALSE(pos5.subsumes(pos1));
+ EXPECT_FALSE(pos5.subsumes(pos2));
+ EXPECT_TRUE(pos5.subsumes(pos3));
+ EXPECT_FALSE(pos5.subsumes(pos4));
+ EXPECT_FALSE(pos5.subsumes(pos5));
+ EXPECT_TRUE(pos5.subsumes(pos6));
+ EXPECT_FALSE(pos5.subsumes(pos7));
+ EXPECT_FALSE(pos5.subsumes(pos8));
+ EXPECT_TRUE(pos5.subsumes(pos9));
+}
+
+TEST(TableDfaTest, position_materialization) {
+ EXPECT_EQ(Position(1,1).materialize(0).index, 0);
+ EXPECT_EQ(Position(1,1).materialize(1).index, 1);
+ EXPECT_EQ(Position(1,1).materialize(2).index, 2);
+ EXPECT_EQ(Position(1,1).materialize(0).edits, 2);
+ EXPECT_EQ(Position(1,1).materialize(1).edits, 1);
+ EXPECT_EQ(Position(1,1).materialize(2).edits, 2);
+}
+
+TEST(TableDfaTest, position_to_string) {
+ Position pos1(0, 0);
+ Position pos2(1, 2);
+ Position pos3(2, 3);
+ EXPECT_EQ(pos1.to_string(), fmt("0#0"));
+ EXPECT_EQ(pos2.to_string(), fmt("1#2"));
+ EXPECT_EQ(pos3.to_string(), fmt("2#3"));
+}
+
+TEST(TableDfaTest, state_creation_reorder) {
+ EXPECT_EQ(State::create<5>({{0,1},{2,0}}).to_string(), fmt("{2#0,0#1}"));
+ EXPECT_EQ(State::create<5>({{2,0},{0,0}}).to_string(), fmt("{0#0,2#0}"));
+}
+
+TEST(TableDfaTest, state_creation_duplicate_removal) {
+ EXPECT_EQ(State::create<5>({{0,0},{0,0},{2,1},{2,1}}).to_string(), fmt("{0#0,2#1}"));
+}
+
+TEST(TableDfaTest, state_creation_edit_cutoff) {
+ EXPECT_EQ(State::create<2>({{0,0},{5,2},{10,3}}).to_string(), fmt("{0#0,5#2}"));
+}
+
+TEST(TableDfaTest, state_creation_subsumption_collapsing) {
+ EXPECT_EQ(State::create<2>({{0,0},{1,1}}).to_string(), fmt("{0#0}"));
+ EXPECT_EQ(State::create<2>({{0,1},{1,0}}).to_string(), fmt("{1#0}"));
+ EXPECT_EQ(State::create<2>({{0,0},{2,2}}).to_string(), fmt("{0#0}"));
+ EXPECT_EQ(State::create<2>({{0,2},{2,0}}).to_string(), fmt("{2#0}"));
+}
+
+TEST(TableDfaTest, state_normalization) {
+ auto state1 = State::create<2>({{2,1},{3,1}});
+ auto state2 = State::create<2>({{5,0},{3,1}});
+ EXPECT_EQ(state1.to_string(), fmt("{2#1,3#1}"));
+ EXPECT_EQ(state2.to_string(), fmt("{5#0,3#1}"));
+ EXPECT_EQ(state1.normalize(), 2);
+ EXPECT_EQ(state2.normalize(), 3);
+ EXPECT_EQ(state1.to_string(), fmt("{0#1,1#1}"));
+ EXPECT_EQ(state2.to_string(), fmt("{2#0,0#1}"));
+}
+
+TEST(TableDfaTest, state_repo) {
+ StateRepo repo;
+ EXPECT_EQ(repo.state_to_idx(State::failed()), 0);
+ EXPECT_EQ(repo.state_to_idx(State::start()), 1);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3);
+ EXPECT_EQ(repo.size(), 4);
+ EXPECT_EQ(repo.idx_to_state(0).to_string(), fmt("{}"));
+ EXPECT_EQ(repo.idx_to_state(1).to_string(), fmt("{0#0}"));
+ EXPECT_EQ(repo.idx_to_state(2).to_string(), fmt("{0#0,1#0}"));
+ EXPECT_EQ(repo.idx_to_state(3).to_string(), fmt("{0#0,2#1}"));
+}
+
+TEST(TableDfaTest, expand_bits) {
+ auto yes = expand_bits<2>(0x1f);
+ auto no = expand_bits<2>(0x00);
+ auto odd = expand_bits<2>(0x0a);
+ auto even = expand_bits<2>(0x15);
+ ASSERT_EQ(yes.size(), 5);
+ ASSERT_EQ(no.size(), 5);
+ ASSERT_EQ(odd.size(), 5);
+ ASSERT_EQ(even.size(), 5);
+ for (size_t i = 0; i < 5; ++i) {
+ EXPECT_TRUE(yes[i]);
+ EXPECT_FALSE(no[i]);
+ EXPECT_EQ(odd[i], bool(i % 2 == 1));
+ EXPECT_EQ(even[i], bool(i % 2 == 0));
+ }
+}
+
+TEST(TableDfaTest, format_bits) {
+ EXPECT_EQ(format_vector(expand_bits<1>(0)), fmt("[0,0,0]"));
+ EXPECT_EQ(format_vector(expand_bits<1>(7)), fmt("[1,1,1]"));
+ EXPECT_EQ(format_vector(expand_bits<1>(5)), fmt("[1,0,1]"));
+ EXPECT_EQ(format_vector(expand_bits<1>(2)), fmt("[0,1,0]"));
+ EXPECT_EQ(format_vector(expand_bits<2>(31)), fmt("[1,1,1,1,1]"));
+ EXPECT_EQ(format_vector(expand_bits<2>(21)), fmt("[1,0,1,0,1]"));
+ EXPECT_EQ(format_vector(expand_bits<2>(31), true), fmt("11111"));
+ EXPECT_EQ(format_vector(expand_bits<2>(21), true), fmt("10101"));
+}
+
+template <uint8_t N>
+void list_states(bool count_only = false) {
+ auto repo = make_state_repo<N>();
+ EXPECT_EQ(num_states<N>(), repo.size());
+ fprintf(stderr, "max_edits: %u, number of states: %zu\n", N, repo.size());
+ if (!count_only) {
+ for (uint32_t i = 0; i < repo.size(); ++i) {
+ fprintf(stderr, " state %u: %s\n", i, repo.idx_to_state(i).to_string().c_str());
+ }
+ }
+}
+
+TEST(TableDfaTest, list_states_for_max_edits_1) { list_states<1>(); }
+TEST(TableDfaTest, list_states_for_max_edits_2) { list_states<2>(); }
+TEST(TableDfaTest, count_states_for_max_edits_3) { list_states<3>(true); }
+
+template <uint8_t N>
+void list_edits() {
+ auto repo = make_state_repo<N>();
+ fprintf(stderr,
+ "per state, listing the minimal number of edits needed\n"
+ "to reach offsets at and beyond its minimal boundary\n");
+ for (uint32_t i = 0; i < repo.size(); ++i) {
+ const State &state = repo.idx_to_state(i);
+ fprintf(stderr, "%-23s : %s\n", state.to_string().c_str(),
+ format_vector(state.make_edit_vector<N>()).c_str());
+ }
+}
+
+TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_1) { list_edits<1>(); }
+TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_2) { list_edits<2>(); }
+
+template <uint8_t N>
+void list_transitions() {
+ auto repo = make_state_repo<N>();
+ for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ auto bits = expand_bits<N>(i);
+ State new_state = state.next<N>(bits);
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ fprintf(stderr, "%u:%s,i --%s--> %u:%s,%s\n", idx, state.to_string().c_str(),
+ format_vector(bits).c_str(), new_idx, new_state.to_string().c_str(),
+ (step == 0) ? "i" : fmt("i+%u", step).c_str());
+ }
+ }
+}
+
+TEST(TableDfaTest, list_transitions_for_max_edits_1) { list_transitions<1>(); }
+
+// Simulate all possible ways we can approach the end of the word we
+// are matching. Verify that no transition taken can produce a state
+// with a minimal boundary that exceeds the boundary of the word
+// itself. Verifying this will enable us to not care about word size
+// while simulating the dfa.
+template <uint8_t N>
+void verify_word_end_boundary() {
+ auto repo = make_state_repo<N>();
+ using StateSet = std::set<uint32_t>;
+ std::vector<StateSet> active(window_size<N>() + 1);
+ for (size_t i = 1; i < repo.size(); ++i) {
+ active[0].insert(i);
+ }
+ EXPECT_EQ(active.size(), window_size<N>() + 1);
+ EXPECT_EQ(active[0].size(), repo.size() - 1);
+ fprintf(stderr, "verifying word end for max edits %u\n", N);
+ uint32_t edge_shape = 0;
+ for (uint32_t active_idx = 0; active_idx < active.size(); ++active_idx) {
+ fprintf(stderr, " edge shape: %s, max step: %zu, active_states: %zu\n",
+ format_vector(expand_bits<N>(edge_shape)).c_str(), active.size() - active_idx - 1, active[active_idx].size());
+ for (uint32_t idx: active[active_idx]) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ if ((i & edge_shape) == 0) {
+ State new_state = state.next<N>(expand_bits<N>(i));
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ if (new_idx != 0) {
+ ASSERT_GT(active.size(), active_idx + step);
+ active[active_idx + step].insert(new_idx);
+ }
+ }
+ }
+ }
+ edge_shape = (edge_shape << 1) + 1;
+ }
+ EXPECT_EQ(edge_shape, (1 << (window_size<N>() + 1)) - 1);
+ while (!active.back().empty()) {
+ fprintf(stderr, " residue states after word end: %zu\n", active.back().size());
+ StateSet residue;
+ for (uint32_t idx: active.back()) {
+ const State &state = repo.idx_to_state(idx);
+ State new_state = state.next<N>(expand_bits<N>(0));
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ ASSERT_EQ(step, 0);
+ if (new_idx != 0) {
+ residue.insert(new_idx);
+ }
+ }
+ active.back() = std::move(residue);
+ }
+}
+
+TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_1) {
+ verify_word_end_boundary<1>();
+}
+
+TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_2) {
+ verify_word_end_boundary<2>();
+}
+
+TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) {
+ auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table);
+ std::ostringstream out;
+ dfa.dump_as_graphviz(out);
+ fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage());
+ fprintf(stderr, "%s", out.str().c_str());
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
index 5e8d29980cd..8ccef84d969 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(vespalib_vespalib_fuzzy OBJECT
implicit_levenshtein_dfa.cpp
levenshtein_dfa.cpp
levenshtein_distance.cpp
+ table_dfa.cpp
unicode_utils.cpp
DEPENDS
)
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 1caae408176..5f6d0ae9956 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "explicit_levenshtein_dfa.h"
#include "implicit_levenshtein_dfa.h"
+#include "table_dfa.h"
#include "levenshtein_dfa.h"
#include "unicode_utils.h"
#include <vespa/vespalib/util/stringfmt.h>
@@ -53,14 +54,19 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max
} else { // max_edits == 2
return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
}
- } else { // DfaType::Explicit
+ } else if(dfa_type == DfaType::Explicit) {
if (max_edits == 1) {
return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
} else { // max_edits == 2
return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
}
+ } else { // DfaType::Table
+ if (max_edits == 1) {
+ return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased));
+ } else { // max_edits == 2
+ return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased));
+ }
}
-
}
LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
@@ -87,9 +93,11 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo
std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) {
if (dt == LevenshteinDfa::DfaType::Implicit) {
os << "Implicit";
- } else {
- assert(dt == LevenshteinDfa::DfaType::Explicit);
+ } else if (dt == LevenshteinDfa::DfaType::Explicit) {
os << "Explicit";
+ } else {
+ assert(dt == LevenshteinDfa::DfaType::Table);
+ os << "Table";
}
return os;
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index c6ca06d4de3..88261bef62c 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -231,7 +231,8 @@ public:
enum class DfaType {
Implicit,
- Explicit
+ Explicit,
+ Table
};
/**
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp
new file mode 100644
index 00000000000..943349818fb
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp
@@ -0,0 +1,10 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "table_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class TableDfa<1>;
+template class TableDfa<2>;
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
new file mode 100644
index 00000000000..dc511a42d04
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
@@ -0,0 +1,64 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "levenshtein_dfa.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+/**
+ * This implementation is based on the paper 'Fast string correction
+ * with Levenshtein automata' from 2002 by Klaus U. Schulz and Stoyan
+ * Mihov.
+ *
+ * Given the maximal distance N, a generic parameterized transition
+ * table is calculated up-front. When a specific word is given, a
+ * simple lookup structure is created to enumerate the possible
+ * characteristic vectors for each position in the given
+ * word. Together, these structures can be used to simulate the
+ * traversal of a hypothetical Levenshtein dfa that will never be
+ * created.
+ *
+ * Approaching the end of the word is handled by padding the
+ * characteristic vectors with 0 bits for everything after the word
+ * ends. In addition, a unit test verifies that there is no possible
+ * sequence of events that leads to the minimal boundary of the state
+ * exceeding the boundary of the word itself. This means that the
+ * simulated dfa can be stepped freely without checking for word size.
+ **/
+template <uint8_t N>
+class TableDfa final : public LevenshteinDfa::Impl
+{
+public:
+ // characteristic vector for a specific input value indicating how
+ // it matches the window starting at the minimal boundary.
+ struct CV {
+ uint32_t input;
+ uint32_t match;
+ CV() noexcept : input(0), match(0) {}
+ };
+ static constexpr size_t window_size() { return 2 * N + 1; }
+ struct Lookup {
+ std::array<CV, window_size()> list;
+ Lookup() noexcept : list() {}
+ };
+
+private:
+ const void *_tfa;
+ const std::vector<Lookup> _lookup;
+ const bool _is_cased;
+
+ static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str);
+
+public:
+ using MatchResult = LevenshteinDfa::MatchResult;
+ TableDfa(std::vector<uint32_t> str, bool is_cased);
+ ~TableDfa() override;
+ [[nodiscard]] MatchResult match(std::string_view source) const override;
+ [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const override;
+ [[nodiscard]] size_t memory_usage() const noexcept override;
+ void dump_as_graphviz(std::ostream& os) const override;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
new file mode 100644
index 00000000000..0693343007e
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
@@ -0,0 +1,497 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "table_dfa.h"
+#include "match_algorithm.hpp"
+#include <vespa/vespalib/util/stringfmt.h>
+#include <cassert>
+#include <stdexcept>
+#include <algorithm>
+#include <map>
+
+namespace vespalib::fuzzy {
+
+namespace {
+
+using vespalib::make_string_short::fmt;
+
+// It is useful to know the number of states compile time to be able
+// to pack lookup tables better.
+template <uint8_t N> constexpr size_t num_states();
+template <> constexpr size_t num_states<1>() { return 6; }
+template <> constexpr size_t num_states<2>() { return 31; }
+template <> constexpr size_t num_states<3>() { return 197; }
+
+template <uint8_t N> constexpr size_t window_size() { return 2 * N + 1; }
+template <uint8_t N> constexpr size_t num_transitions() { return 1 << window_size<N>(); }
+
+
+auto diff(auto a, auto b) { return (a > b) ? (a - b) : (b - a); }
+
+// A Position combines an index into a word being matched with the
+// number of edits needed to get there. This maps directly onto a
+// specific state in the NFA used to match a word. Note that the sort
+// order prefers low edits over low indexs. This is to ensure that a
+// position that subsumes another position will always sort before it.
+struct Position {
+ uint32_t index;
+ uint32_t edits;
+ Position(uint32_t index_in, uint32_t edits_in) noexcept
+ : index(index_in), edits(edits_in) {}
+ static Position start() noexcept { return Position(0,0); }
+ bool subsumes(const Position &rhs) const noexcept {
+ if (edits >= rhs.edits) {
+ return false;
+ }
+ return diff(index, rhs.index) <= (rhs.edits - edits);
+ }
+ Position materialize(uint32_t target_index) const noexcept {
+ return Position(target_index, edits + diff(index, target_index));
+ }
+ bool operator==(const Position &rhs) const noexcept {
+ return (index == rhs.index) && (edits == rhs.edits);
+ }
+ bool operator<(const Position &rhs) const noexcept {
+ return std::tie(edits,index) < std::tie(rhs.edits, rhs.index);
+ }
+ template <uint8_t N>
+ void add_elementary_transitions(const std::vector<bool> &bits, std::vector<Position> &dst) const {
+ assert(bits.size() > index);
+ if (!bits[index]) {
+ dst.emplace_back(index, edits + 1);
+ dst.emplace_back(index + 1, edits + 1);
+ }
+ for (uint32_t e = 0; (edits + e) <= N; ++e) {
+ assert(bits.size() > (index + e));
+ if (bits[index + e]) {
+ dst.emplace_back(index + e + 1, edits + e);
+ }
+ }
+ }
+ vespalib::string to_string() const { return fmt("%u#%u", index, edits); }
+};
+
+// A State is a collection of different Positions that do not subsume
+// each other. If the minimal boundary of a state is larger than 0, it
+// can be lifted from the state in a normalizing operation that will
+// renumber the position indexes such that the minimal boundary of the
+// state becomes 0. This is to allow parameterized states where the
+// general progress of matching the string (minimal boundary of
+// non-normalized state) is untangled from the local competing edit
+// alternatives (normalized state).
+struct State {
+ std::vector<Position> list;
+ State() noexcept : list() {}
+ static State failed() noexcept { return State(); }
+ static State start() {
+ State result;
+ result.list.push_back(Position::start());
+ return result;
+ }
+ bool operator<(const State &rhs) const {
+ return list < rhs.list;
+ }
+ uint32_t minimal_boundary() const noexcept {
+ if (list.empty()) {
+ return 0;
+ }
+ uint32_t min = list[0].index;
+ for (size_t i = 1; i < list.size(); ++i) {
+ min = std::min(min, list[i].index);
+ }
+ return min;
+ }
+ uint32_t normalize() {
+ uint32_t min = minimal_boundary();
+ if (min > 0) {
+ for (auto &entry: list) {
+ entry.index -= min;
+ }
+ }
+ return min;
+ }
+ template <uint8_t N>
+ static State create(std::vector<Position> list_in) {
+ State result;
+ auto want = [&result](Position pos) {
+ if (pos.edits > N) {
+ return false;
+ }
+ for (const auto &old_pos: result.list) {
+ if (old_pos == pos || old_pos.subsumes(pos)) {
+ return false;
+ }
+ }
+ return true;
+ };
+ std::sort(list_in.begin(), list_in.end());
+ for (const auto &pos: list_in) {
+ if (want(pos)) {
+ result.list.push_back(pos);
+ }
+ }
+ return result;
+ }
+ template <uint8_t N>
+ State next(const std::vector<bool> &bits) const {
+ std::vector<Position> tmp;
+ for (const auto &pos: list) {
+ pos.add_elementary_transitions<N>(bits, tmp);
+ }
+ return create<N>(std::move(tmp));
+ }
+ template <uint8_t N>
+ std::vector<uint8_t> make_edit_vector() const {
+ std::vector<uint8_t> result(window_size<N>(), N + 1);
+ for (const auto &pos: list) {
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ result[i] = std::min(result[i], uint8_t(pos.materialize(i).edits));
+ }
+ }
+ return result;
+ }
+ vespalib::string to_string() const {
+ vespalib::string result = "{";
+ for (size_t i = 0; i < list.size(); ++i) {
+ if (i > 0) {
+ result.append(",");
+ }
+ result.append(list[i].to_string());
+ }
+ result.append("}");
+ return result;
+ }
+};
+
+// Keeps track of unique states, assigning an integer value to each
+// state. Only states with minimal boundary 0 is allowed to be
+// inserted into a state repo. Each repo is seeded with the empty
+// state (0) and the start state (1). An assigned integer value can be
+// mapped back into the originating state.
+struct StateRepo {
+ using Map = std::map<State,uint32_t>;
+ using Ref = Map::iterator;
+ Map seen;
+ std::vector<Ref> refs;
+ StateRepo() noexcept : seen(), refs() {
+ auto failed_idx = state_to_idx(State::failed());
+ auto start_idx = state_to_idx(State::start());
+ assert(failed_idx == 0);
+ assert(start_idx == 1);
+ }
+ ~StateRepo();
+ size_t size() const { return seen.size(); }
+ uint32_t state_to_idx(const State &state) {
+ assert(state.minimal_boundary() == 0);
+ uint32_t next = refs.size();
+ auto [pos, inserted] = seen.emplace(state, next);
+ if (inserted) {
+ refs.push_back(pos);
+ }
+ assert(seen.size() == refs.size());
+ return pos->second;
+ }
+ const State &idx_to_state(uint32_t idx) const {
+ assert(idx < refs.size());
+ return refs[idx]->first;
+ }
+};
+StateRepo::~StateRepo() = default;
+
+template <uint8_t N>
+std::vector<bool> expand_bits(uint32_t value) {
+ static_assert(N < 10);
+ std::vector<bool> result(window_size<N>());
+ uint32_t look_for = num_transitions<N>();
+ assert(value < look_for);
+ for (size_t i = 0; i < result.size(); ++i) {
+ look_for >>= 1;
+ result[i] = (value & look_for);
+ }
+ return result;
+}
+
+template <uint8_t N>
+[[maybe_unused]] StateRepo make_state_repo() {
+ StateRepo repo;
+ for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ State new_state = state.next<N>(expand_bits<N>(i));
+ (void) new_state.normalize();
+ (void) repo.state_to_idx(new_state);
+ }
+ }
+ return repo;
+}
+
+// this is the result of our efforts
+template <uint8_t N>
+struct Tfa {
+ struct Entry {
+ uint8_t step;
+ uint8_t state;
+ };
+
+ // what happens when following a transition from a state?
+ std::array<std::array<Entry,num_transitions<N>()>,num_states<N>()> table;
+
+ // how many edits did we use to match the target word?
+ std::array<std::array<uint8_t,window_size<N>()>,num_states<N>()> edits;
+};
+
+template <uint8_t N>
+std::unique_ptr<Tfa<N>> make_tfa() {
+ auto tfa = std::make_unique<Tfa<N>>();
+ StateRepo repo;
+ uint32_t state_idx = 0;
+ for (; state_idx < repo.size(); ++state_idx) {
+ const State &state = repo.idx_to_state(state_idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ State new_state = state.next<N>(expand_bits<N>(i));
+ uint32_t step = new_state.normalize();
+ uint32_t new_state_idx = repo.state_to_idx(new_state);
+ assert(step < 256);
+ assert(new_state_idx < 256);
+ tfa->table[state_idx][i].step = step;
+ tfa->table[state_idx][i].state = new_state_idx;
+ }
+ auto edits = state.make_edit_vector<N>();
+ assert(edits.size() == window_size<N>());
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ tfa->edits[state_idx][i] = edits[i];
+ }
+ }
+ assert(repo.size() == num_states<N>());
+ assert(state_idx == num_states<N>());
+ return tfa;
+}
+
+template <uint8_t N>
+const Tfa<N> *get_tfa() {
+ static std::unique_ptr<Tfa<N>> tfa = make_tfa<N>();
+ return tfa.get();
+}
+
+template <typename T>
+vespalib::string format_vector(const std::vector<T> &vector, bool compact = false) {
+ vespalib::string str = compact ? "" : "[";
+ for (size_t i = 0; i < vector.size(); ++i) {
+ if (i > 0 && !compact) {
+ str.append(",");
+ }
+ str.append(fmt("%u", uint32_t(vector[i])));
+ }
+ if (!compact) {
+ str.append("]");
+ }
+ return str;
+}
+
+template <uint8_t N>
+struct TableMatcher {
+ struct S {
+ uint32_t index;
+ uint32_t state;
+ // needed by dfa matcher concept (should use std::declval instead)
+ constexpr S() noexcept : index(0), state(0) {}
+ constexpr S(uint32_t i, uint32_t s) noexcept : index(i), state(s) {}
+ S next(const Tfa<N> *tfa, uint32_t bits) noexcept {
+ auto entry = tfa->table[state][bits];
+ return S(index + entry.step, entry.state);
+ }
+ constexpr bool is_valid_edge(const Tfa<N> *tfa, uint32_t bits) const noexcept {
+ return tfa->table[state][bits].state != 0;
+ }
+ };
+ using StateType = S;
+ using StateParamType = StateType;
+ using EdgeType = uint32_t;
+
+ const Tfa<N> *tfa;
+ const TableDfa<N>::Lookup *lookup;
+ const uint32_t end;
+ const bool cased;
+
+ TableMatcher(const Tfa<N> *tfa_in, const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in)
+ noexcept : tfa(tfa_in), lookup(lookup_in), end(end_in), cased(cased_in) {}
+
+ bool is_cased() const noexcept { return cased; }
+ static constexpr S start() noexcept { return S(0, 1); }
+
+ uint8_t match_edit_distance(S s) const noexcept {
+ uint32_t leap = (end - s.index);
+ return (leap < window_size<N>()) ? tfa->edits[s.state][leap] : N + 1;
+ }
+ bool is_match(S s) const noexcept { return match_edit_distance(s) <= N; }
+
+ static constexpr bool can_match(S s) noexcept { return (s.state != 0); }
+ static constexpr bool valid_state(S s) noexcept { return (s.state != 0); }
+
+ S match_wildcard(S s) const noexcept { return s.next(tfa, 0); }
+ S match_input(S s, uint32_t c) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ for (size_t i = 0; i < window_size<N>() && slice[i].input != 0; ++i) {
+ if (slice[i].input == c) {
+ return s.next(tfa, slice[i].match);
+ }
+ }
+ return match_wildcard(s);
+ }
+
+ bool has_higher_out_edge(S s, uint32_t c) const noexcept {
+ if (s.is_valid_edge(tfa, 0)) {
+ return true;
+ }
+ const auto *slice = lookup[s.index].list.data();
+ for (size_t i = 0; i < window_size<N>() && slice[i].input > c; ++i) {
+ if (s.is_valid_edge(tfa, slice[i].match)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool has_exact_explicit_out_edge(S s, uint32_t c) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ for (size_t i = 0; i < window_size<N>() && slice[i].input >= c; ++i) {
+ if (slice[i].input == c) {
+ return s.is_valid_edge(tfa, slice[i].match);
+ }
+ }
+ return false;
+ }
+
+ uint32_t lowest_higher_explicit_out_edge(S s, uint32_t c) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ size_t i = window_size<N>();
+ while (i-- > 0) {
+ if (slice[i].input > c && s.is_valid_edge(tfa, slice[i].match)) {
+ return slice[i].input;
+ }
+ }
+ return 0;
+ }
+
+ uint32_t smallest_explicit_out_edge(S s) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ size_t i = window_size<N>();
+ while (i-- > 0) {
+ if (slice[i].input != 0 && s.is_valid_edge(tfa, slice[i].match)) {
+ return slice[i].input;
+ }
+ }
+ return 0;
+ }
+
+ static constexpr bool valid_edge(uint32_t c) noexcept { return c != 0; }
+ static constexpr uint32_t edge_to_u32char(uint32_t c) noexcept { return c; }
+ S edge_to_state(S s, uint32_t c) const noexcept { return match_input(s, c); }
+
+ static constexpr bool implies_exact_match_suffix(S) noexcept { return false; }
+ static constexpr void emit_exact_match_suffix(S, std::vector<uint32_t> &) {} // not called
+ static constexpr void emit_exact_match_suffix(S, std::string &) {} // not called
+};
+
+} // unnamed
+
+template <uint8_t N>
+auto
+TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup>
+{
+ std::vector<Lookup> result(str.size() + 1);
+ auto have_already = [&](uint32_t c, size_t i)noexcept{
+ for (size_t j = 0; j < window_size(); ++j) {
+ if (result[i].list[j].input == c) {
+ return true;
+ }
+ }
+ return false;
+ };
+ auto make_vector = [&](uint32_t c, size_t i)noexcept{
+ uint32_t bits = 0;
+ for (size_t j = 0; j < window_size(); ++j) {
+ bool found = ((i + j) < str.size()) && (str[i+j] == c);
+ bits = (bits << 1) + found;
+ }
+ return bits;
+ };
+ for (size_t i = 0; i < str.size(); ++i) {
+ for (size_t j = 0; j < window_size(); ++j) {
+ assert(result[i].list[j].input == 0);
+ assert(result[i].list[j].match == 0);
+ if ((i + j) < str.size()) {
+ uint32_t c = str[i + j];
+ if (!have_already(c, i)) {
+ result[i].list[j].input = c;
+ result[i].list[j].match = make_vector(c, i);
+ }
+ }
+ }
+ std::sort(result[i].list.begin(), result[i].list.end(),
+ [](const auto &a, const auto &b){ return a.input > b.input; });
+ }
+ return result;
+}
+
+template <uint8_t N>
+TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased)
+ : _tfa(get_tfa<N>()),
+ _lookup(make_lookup(str)),
+ _is_cased(is_cased)
+{
+}
+
+template <uint8_t N>
+TableDfa<N>::~TableDfa() = default;
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str) const
+{
+ TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased);
+ return MatchAlgorithm<N>::match(matcher, u8str);
+}
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const
+{
+ TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased);
+ return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const
+{
+ TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased);
+ return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t N>
+size_t
+TableDfa<N>::memory_usage() const noexcept
+{
+ return _lookup.size() * sizeof(Lookup);
+}
+
+template <uint8_t N>
+void
+TableDfa<N>::dump_as_graphviz(std::ostream &os) const
+{
+ os << std::dec << "digraph table_dfa {\n";
+ for (size_t i = 0; i < _lookup.size(); ++i) {
+ for (size_t j = 0; j < window_size(); ++j) {
+ if (_lookup[i].list[j].input != 0) {
+ std::string as_utf8;
+ append_utf32_char(as_utf8, _lookup[i].list[j].input);
+ os << " x" << i << " -> " << _lookup[i].list[j].match << " [label=\"" << as_utf8 << "\"];\n";
+ }
+ }
+ os << " x" << i << " -> 0 [label=\"*\"];\n";
+ }
+ os << "}\n";
+}
+
+}