diff options
author | Håvard Pettersen <havardpe@yahooinc.com> | 2023-09-18 15:04:26 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@yahooinc.com> | 2023-09-25 14:54:38 +0000 |
commit | 985306887f246dafaeb0315ef3c3331b0e33e114 (patch) | |
tree | e03de7b856a4ee91e8e18dd786b0788b3da1c75b /vespalib | |
parent | 7facdd6177063f772c497000b9c12e4653a2db83 (diff) |
table dfa
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp | 9 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt | 9 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp | 298 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt | 1 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp | 16 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h | 3 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp | 10 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/table_dfa.h | 64 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp | 497 |
10 files changed, 900 insertions, 8 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index c1d7e17b457..56dcd9abdf6 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -96,6 +96,7 @@ vespa_define_module( src/tests/fileheader src/tests/floatingpointtype src/tests/fuzzy + src/tests/fuzzy/table_dfa src/tests/gencnt src/tests/growablebytebuffer src/tests/guard diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index c235cb99509..02df6edc370 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -82,7 +82,8 @@ INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes, Combine(Values(LevenshteinDfa::Casing::Uncased, LevenshteinDfa::Casing::Cased), Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit)), + LevenshteinDfa::DfaType::Implicit, + LevenshteinDfa::DfaType::Table)), LevenshteinDfaTest::stringify_params); // Same as existing non-DFA Levenshtein tests, but with some added instantiations @@ -233,7 +234,8 @@ struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> { INSTANTIATE_TEST_SUITE_P(AllDfaTypes, LevenshteinDfaCasingTest, Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit), + LevenshteinDfa::DfaType::Implicit, + LevenshteinDfa::DfaType::Table), PrintToStringParamName()); TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) { @@ -315,7 +317,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, Combine(Values(LevenshteinDfa::Casing::Uncased, LevenshteinDfa::Casing::Cased), Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit), + LevenshteinDfa::DfaType::Implicit, + LevenshteinDfa::DfaType::Table), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); diff --git a/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt new file mode 100644 index 00000000000..1017ac99564 --- /dev/null +++ b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_fuzzy_table_dfa_test_app TEST + SOURCES + table_dfa_test.cpp + DEPENDS + vespalib + GTest::GTest + ) +vespa_add_test(NAME vespalib_fuzzy_table_dfa_test_app COMMAND vespalib_fuzzy_table_dfa_test_app) diff --git a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp new file mode 100644 index 00000000000..7782a39c3c7 --- /dev/null +++ b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp @@ -0,0 +1,298 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/fuzzy/table_dfa.hpp> +#include <vespa/vespalib/gtest/gtest.h> +#include <set> + +using namespace ::testing; +using namespace vespalib::fuzzy; + +// test/experiment with low-level concepts underlying the construction +// of the tables used in the table-driven dfa implementation. + +TEST(TableDfaTest, position) { + Position pos1 = Position::start(); + EXPECT_EQ(pos1.index, 0); + EXPECT_EQ(pos1.edits, 0); + Position pos2(2, 3); + EXPECT_EQ(pos2.index, 2); + EXPECT_EQ(pos2.edits, 3); +} + +TEST(TableDfaTest, position_equality) { + Position pos1(0, 0); + Position pos2(0, 1); + Position pos3(1, 0); + EXPECT_TRUE(pos1 == pos1); + EXPECT_FALSE(pos1 == pos2); + EXPECT_FALSE(pos1 == pos2); +} + +TEST(TableDfaTest, position_sort_order) { + std::vector<Position> list; + list.emplace_back(0,1); + list.emplace_back(0,0); + list.emplace_back(1,0); + list.emplace_back(1,1); + std::sort(list.begin(), list.end()); + EXPECT_EQ(list[0].index, 0); + EXPECT_EQ(list[0].edits, 0); + EXPECT_EQ(list[1].index, 1); + EXPECT_EQ(list[1].edits, 0); + EXPECT_EQ(list[2].index, 0); + EXPECT_EQ(list[2].edits, 1); + EXPECT_EQ(list[3].index, 1); + EXPECT_EQ(list[3].edits, 1); +} + +TEST(TableDfaTest, position_subsumption) { + Position pos1(0, 0); + Position pos2(0, 1); + Position pos3(0, 2); + + Position pos4(1, 0); + Position pos5(1, 1); + Position pos6(1, 2); + + Position pos7(2, 0); + Position pos8(2, 1); + Position pos9(2, 2); + + EXPECT_FALSE(pos1.subsumes(pos1)); + EXPECT_TRUE(pos1.subsumes(pos2)); + EXPECT_TRUE(pos1.subsumes(pos3)); + EXPECT_FALSE(pos1.subsumes(pos4)); + EXPECT_TRUE(pos1.subsumes(pos5)); + EXPECT_TRUE(pos1.subsumes(pos6)); + EXPECT_FALSE(pos1.subsumes(pos7)); + EXPECT_FALSE(pos1.subsumes(pos8)); + EXPECT_TRUE(pos1.subsumes(pos9)); + + EXPECT_FALSE(pos5.subsumes(pos1)); + EXPECT_FALSE(pos5.subsumes(pos2)); + EXPECT_TRUE(pos5.subsumes(pos3)); + EXPECT_FALSE(pos5.subsumes(pos4)); + EXPECT_FALSE(pos5.subsumes(pos5)); + EXPECT_TRUE(pos5.subsumes(pos6)); + EXPECT_FALSE(pos5.subsumes(pos7)); + EXPECT_FALSE(pos5.subsumes(pos8)); + EXPECT_TRUE(pos5.subsumes(pos9)); +} + +TEST(TableDfaTest, position_materialization) { + EXPECT_EQ(Position(1,1).materialize(0).index, 0); + EXPECT_EQ(Position(1,1).materialize(1).index, 1); + EXPECT_EQ(Position(1,1).materialize(2).index, 2); + EXPECT_EQ(Position(1,1).materialize(0).edits, 2); + EXPECT_EQ(Position(1,1).materialize(1).edits, 1); + EXPECT_EQ(Position(1,1).materialize(2).edits, 2); +} + +TEST(TableDfaTest, position_to_string) { + Position pos1(0, 0); + Position pos2(1, 2); + Position pos3(2, 3); + EXPECT_EQ(pos1.to_string(), fmt("0#0")); + EXPECT_EQ(pos2.to_string(), fmt("1#2")); + EXPECT_EQ(pos3.to_string(), fmt("2#3")); +} + +TEST(TableDfaTest, state_creation_reorder) { + EXPECT_EQ(State::create<5>({{0,1},{2,0}}).to_string(), fmt("{2#0,0#1}")); + EXPECT_EQ(State::create<5>({{2,0},{0,0}}).to_string(), fmt("{0#0,2#0}")); +} + +TEST(TableDfaTest, state_creation_duplicate_removal) { + EXPECT_EQ(State::create<5>({{0,0},{0,0},{2,1},{2,1}}).to_string(), fmt("{0#0,2#1}")); +} + +TEST(TableDfaTest, state_creation_edit_cutoff) { + EXPECT_EQ(State::create<2>({{0,0},{5,2},{10,3}}).to_string(), fmt("{0#0,5#2}")); +} + +TEST(TableDfaTest, state_creation_subsumption_collapsing) { + EXPECT_EQ(State::create<2>({{0,0},{1,1}}).to_string(), fmt("{0#0}")); + EXPECT_EQ(State::create<2>({{0,1},{1,0}}).to_string(), fmt("{1#0}")); + EXPECT_EQ(State::create<2>({{0,0},{2,2}}).to_string(), fmt("{0#0}")); + EXPECT_EQ(State::create<2>({{0,2},{2,0}}).to_string(), fmt("{2#0}")); +} + +TEST(TableDfaTest, state_normalization) { + auto state1 = State::create<2>({{2,1},{3,1}}); + auto state2 = State::create<2>({{5,0},{3,1}}); + EXPECT_EQ(state1.to_string(), fmt("{2#1,3#1}")); + EXPECT_EQ(state2.to_string(), fmt("{5#0,3#1}")); + EXPECT_EQ(state1.normalize(), 2); + EXPECT_EQ(state2.normalize(), 3); + EXPECT_EQ(state1.to_string(), fmt("{0#1,1#1}")); + EXPECT_EQ(state2.to_string(), fmt("{2#0,0#1}")); +} + +TEST(TableDfaTest, state_repo) { + StateRepo repo; + EXPECT_EQ(repo.state_to_idx(State::failed()), 0); + EXPECT_EQ(repo.state_to_idx(State::start()), 1); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3); + EXPECT_EQ(repo.size(), 4); + EXPECT_EQ(repo.idx_to_state(0).to_string(), fmt("{}")); + EXPECT_EQ(repo.idx_to_state(1).to_string(), fmt("{0#0}")); + EXPECT_EQ(repo.idx_to_state(2).to_string(), fmt("{0#0,1#0}")); + EXPECT_EQ(repo.idx_to_state(3).to_string(), fmt("{0#0,2#1}")); +} + +TEST(TableDfaTest, expand_bits) { + auto yes = expand_bits<2>(0x1f); + auto no = expand_bits<2>(0x00); + auto odd = expand_bits<2>(0x0a); + auto even = expand_bits<2>(0x15); + ASSERT_EQ(yes.size(), 5); + ASSERT_EQ(no.size(), 5); + ASSERT_EQ(odd.size(), 5); + ASSERT_EQ(even.size(), 5); + for (size_t i = 0; i < 5; ++i) { + EXPECT_TRUE(yes[i]); + EXPECT_FALSE(no[i]); + EXPECT_EQ(odd[i], bool(i % 2 == 1)); + EXPECT_EQ(even[i], bool(i % 2 == 0)); + } +} + +TEST(TableDfaTest, format_bits) { + EXPECT_EQ(format_vector(expand_bits<1>(0)), fmt("[0,0,0]")); + EXPECT_EQ(format_vector(expand_bits<1>(7)), fmt("[1,1,1]")); + EXPECT_EQ(format_vector(expand_bits<1>(5)), fmt("[1,0,1]")); + EXPECT_EQ(format_vector(expand_bits<1>(2)), fmt("[0,1,0]")); + EXPECT_EQ(format_vector(expand_bits<2>(31)), fmt("[1,1,1,1,1]")); + EXPECT_EQ(format_vector(expand_bits<2>(21)), fmt("[1,0,1,0,1]")); + EXPECT_EQ(format_vector(expand_bits<2>(31), true), fmt("11111")); + EXPECT_EQ(format_vector(expand_bits<2>(21), true), fmt("10101")); +} + +template <uint8_t N> +void list_states(bool count_only = false) { + auto repo = make_state_repo<N>(); + EXPECT_EQ(num_states<N>(), repo.size()); + fprintf(stderr, "max_edits: %u, number of states: %zu\n", N, repo.size()); + if (!count_only) { + for (uint32_t i = 0; i < repo.size(); ++i) { + fprintf(stderr, " state %u: %s\n", i, repo.idx_to_state(i).to_string().c_str()); + } + } +} + +TEST(TableDfaTest, list_states_for_max_edits_1) { list_states<1>(); } +TEST(TableDfaTest, list_states_for_max_edits_2) { list_states<2>(); } +TEST(TableDfaTest, count_states_for_max_edits_3) { list_states<3>(true); } + +template <uint8_t N> +void list_edits() { + auto repo = make_state_repo<N>(); + fprintf(stderr, + "per state, listing the minimal number of edits needed\n" + "to reach offsets at and beyond its minimal boundary\n"); + for (uint32_t i = 0; i < repo.size(); ++i) { + const State &state = repo.idx_to_state(i); + fprintf(stderr, "%-23s : %s\n", state.to_string().c_str(), + format_vector(state.make_edit_vector<N>()).c_str()); + } +} + +TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_1) { list_edits<1>(); } +TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_2) { list_edits<2>(); } + +template <uint8_t N> +void list_transitions() { + auto repo = make_state_repo<N>(); + for (uint32_t idx = 0; idx < repo.size(); ++idx) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + auto bits = expand_bits<N>(i); + State new_state = state.next<N>(bits); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + fprintf(stderr, "%u:%s,i --%s--> %u:%s,%s\n", idx, state.to_string().c_str(), + format_vector(bits).c_str(), new_idx, new_state.to_string().c_str(), + (step == 0) ? "i" : fmt("i+%u", step).c_str()); + } + } +} + +TEST(TableDfaTest, list_transitions_for_max_edits_1) { list_transitions<1>(); } + +// Simulate all possible ways we can approach the end of the word we +// are matching. Verify that no transition taken can produce a state +// with a minimal boundary that exceeds the boundary of the word +// itself. Verifying this will enable us to not care about word size +// while simulating the dfa. +template <uint8_t N> +void verify_word_end_boundary() { + auto repo = make_state_repo<N>(); + using StateSet = std::set<uint32_t>; + std::vector<StateSet> active(window_size<N>() + 1); + for (size_t i = 1; i < repo.size(); ++i) { + active[0].insert(i); + } + EXPECT_EQ(active.size(), window_size<N>() + 1); + EXPECT_EQ(active[0].size(), repo.size() - 1); + fprintf(stderr, "verifying word end for max edits %u\n", N); + uint32_t edge_shape = 0; + for (uint32_t active_idx = 0; active_idx < active.size(); ++active_idx) { + fprintf(stderr, " edge shape: %s, max step: %zu, active_states: %zu\n", + format_vector(expand_bits<N>(edge_shape)).c_str(), active.size() - active_idx - 1, active[active_idx].size()); + for (uint32_t idx: active[active_idx]) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + if ((i & edge_shape) == 0) { + State new_state = state.next<N>(expand_bits<N>(i)); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + if (new_idx != 0) { + ASSERT_GT(active.size(), active_idx + step); + active[active_idx + step].insert(new_idx); + } + } + } + } + edge_shape = (edge_shape << 1) + 1; + } + EXPECT_EQ(edge_shape, (1 << (window_size<N>() + 1)) - 1); + while (!active.back().empty()) { + fprintf(stderr, " residue states after word end: %zu\n", active.back().size()); + StateSet residue; + for (uint32_t idx: active.back()) { + const State &state = repo.idx_to_state(idx); + State new_state = state.next<N>(expand_bits<N>(0)); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + ASSERT_EQ(step, 0); + if (new_idx != 0) { + residue.insert(new_idx); + } + } + active.back() = std::move(residue); + } +} + +TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_1) { + verify_word_end_boundary<1>(); +} + +TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_2) { + verify_word_end_boundary<2>(); +} + +TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) { + auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table); + std::ostringstream out; + dfa.dump_as_graphviz(out); + fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage()); + fprintf(stderr, "%s", out.str().c_str()); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt index 5e8d29980cd..8ccef84d969 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(vespalib_vespalib_fuzzy OBJECT implicit_levenshtein_dfa.cpp levenshtein_dfa.cpp levenshtein_distance.cpp + table_dfa.cpp unicode_utils.cpp DEPENDS ) diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index 1caae408176..5f6d0ae9956 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "explicit_levenshtein_dfa.h" #include "implicit_levenshtein_dfa.h" +#include "table_dfa.h" #include "levenshtein_dfa.h" #include "unicode_utils.h" #include <vespa/vespalib/util/stringfmt.h> @@ -53,14 +54,19 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max } else { // max_edits == 2 return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased)); } - } else { // DfaType::Explicit + } else if(dfa_type == DfaType::Explicit) { if (max_edits == 1) { return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa(); } else { // max_edits == 2 return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa(); } + } else { // DfaType::Table + if (max_edits == 1) { + return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased)); + } else { // max_edits == 2 + return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased)); + } } - } LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) { @@ -87,9 +93,11 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) { if (dt == LevenshteinDfa::DfaType::Implicit) { os << "Implicit"; - } else { - assert(dt == LevenshteinDfa::DfaType::Explicit); + } else if (dt == LevenshteinDfa::DfaType::Explicit) { os << "Explicit"; + } else { + assert(dt == LevenshteinDfa::DfaType::Table); + os << "Table"; } return os; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index c6ca06d4de3..88261bef62c 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -231,7 +231,8 @@ public: enum class DfaType { Implicit, - Explicit + Explicit, + Table }; /** diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp new file mode 100644 index 00000000000..943349818fb --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp @@ -0,0 +1,10 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "table_dfa.hpp" + +namespace vespalib::fuzzy { + +template class TableDfa<1>; +template class TableDfa<2>; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h new file mode 100644 index 00000000000..dc511a42d04 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h @@ -0,0 +1,64 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "levenshtein_dfa.h" +#include <vector> + +namespace vespalib::fuzzy { + +/** + * This implementation is based on the paper 'Fast string correction + * with Levenshtein automata' from 2002 by Klaus U. Schulz and Stoyan + * Mihov. + * + * Given the maximal distance N, a generic parameterized transition + * table is calculated up-front. When a specific word is given, a + * simple lookup structure is created to enumerate the possible + * characteristic vectors for each position in the given + * word. Together, these structures can be used to simulate the + * traversal of a hypothetical Levenshtein dfa that will never be + * created. + * + * Approaching the end of the word is handled by padding the + * characteristic vectors with 0 bits for everything after the word + * ends. In addition, a unit test verifies that there is no possible + * sequence of events that leads to the minimal boundary of the state + * exceeding the boundary of the word itself. This means that the + * simulated dfa can be stepped freely without checking for word size. + **/ +template <uint8_t N> +class TableDfa final : public LevenshteinDfa::Impl +{ +public: + // characteristic vector for a specific input value indicating how + // it matches the window starting at the minimal boundary. + struct CV { + uint32_t input; + uint32_t match; + CV() noexcept : input(0), match(0) {} + }; + static constexpr size_t window_size() { return 2 * N + 1; } + struct Lookup { + std::array<CV, window_size()> list; + Lookup() noexcept : list() {} + }; + +private: + const void *_tfa; + const std::vector<Lookup> _lookup; + const bool _is_cased; + + static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str); + +public: + using MatchResult = LevenshteinDfa::MatchResult; + TableDfa(std::vector<uint32_t> str, bool is_cased); + ~TableDfa() override; + [[nodiscard]] MatchResult match(std::string_view source) const override; + [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const override; + [[nodiscard]] size_t memory_usage() const noexcept override; + void dump_as_graphviz(std::ostream& os) const override; +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp new file mode 100644 index 00000000000..0693343007e --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp @@ -0,0 +1,497 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "table_dfa.h" +#include "match_algorithm.hpp" +#include <vespa/vespalib/util/stringfmt.h> +#include <cassert> +#include <stdexcept> +#include <algorithm> +#include <map> + +namespace vespalib::fuzzy { + +namespace { + +using vespalib::make_string_short::fmt; + +// It is useful to know the number of states compile time to be able +// to pack lookup tables better. +template <uint8_t N> constexpr size_t num_states(); +template <> constexpr size_t num_states<1>() { return 6; } +template <> constexpr size_t num_states<2>() { return 31; } +template <> constexpr size_t num_states<3>() { return 197; } + +template <uint8_t N> constexpr size_t window_size() { return 2 * N + 1; } +template <uint8_t N> constexpr size_t num_transitions() { return 1 << window_size<N>(); } + + +auto diff(auto a, auto b) { return (a > b) ? (a - b) : (b - a); } + +// A Position combines an index into a word being matched with the +// number of edits needed to get there. This maps directly onto a +// specific state in the NFA used to match a word. Note that the sort +// order prefers low edits over low indexs. This is to ensure that a +// position that subsumes another position will always sort before it. +struct Position { + uint32_t index; + uint32_t edits; + Position(uint32_t index_in, uint32_t edits_in) noexcept + : index(index_in), edits(edits_in) {} + static Position start() noexcept { return Position(0,0); } + bool subsumes(const Position &rhs) const noexcept { + if (edits >= rhs.edits) { + return false; + } + return diff(index, rhs.index) <= (rhs.edits - edits); + } + Position materialize(uint32_t target_index) const noexcept { + return Position(target_index, edits + diff(index, target_index)); + } + bool operator==(const Position &rhs) const noexcept { + return (index == rhs.index) && (edits == rhs.edits); + } + bool operator<(const Position &rhs) const noexcept { + return std::tie(edits,index) < std::tie(rhs.edits, rhs.index); + } + template <uint8_t N> + void add_elementary_transitions(const std::vector<bool> &bits, std::vector<Position> &dst) const { + assert(bits.size() > index); + if (!bits[index]) { + dst.emplace_back(index, edits + 1); + dst.emplace_back(index + 1, edits + 1); + } + for (uint32_t e = 0; (edits + e) <= N; ++e) { + assert(bits.size() > (index + e)); + if (bits[index + e]) { + dst.emplace_back(index + e + 1, edits + e); + } + } + } + vespalib::string to_string() const { return fmt("%u#%u", index, edits); } +}; + +// A State is a collection of different Positions that do not subsume +// each other. If the minimal boundary of a state is larger than 0, it +// can be lifted from the state in a normalizing operation that will +// renumber the position indexes such that the minimal boundary of the +// state becomes 0. This is to allow parameterized states where the +// general progress of matching the string (minimal boundary of +// non-normalized state) is untangled from the local competing edit +// alternatives (normalized state). +struct State { + std::vector<Position> list; + State() noexcept : list() {} + static State failed() noexcept { return State(); } + static State start() { + State result; + result.list.push_back(Position::start()); + return result; + } + bool operator<(const State &rhs) const { + return list < rhs.list; + } + uint32_t minimal_boundary() const noexcept { + if (list.empty()) { + return 0; + } + uint32_t min = list[0].index; + for (size_t i = 1; i < list.size(); ++i) { + min = std::min(min, list[i].index); + } + return min; + } + uint32_t normalize() { + uint32_t min = minimal_boundary(); + if (min > 0) { + for (auto &entry: list) { + entry.index -= min; + } + } + return min; + } + template <uint8_t N> + static State create(std::vector<Position> list_in) { + State result; + auto want = [&result](Position pos) { + if (pos.edits > N) { + return false; + } + for (const auto &old_pos: result.list) { + if (old_pos == pos || old_pos.subsumes(pos)) { + return false; + } + } + return true; + }; + std::sort(list_in.begin(), list_in.end()); + for (const auto &pos: list_in) { + if (want(pos)) { + result.list.push_back(pos); + } + } + return result; + } + template <uint8_t N> + State next(const std::vector<bool> &bits) const { + std::vector<Position> tmp; + for (const auto &pos: list) { + pos.add_elementary_transitions<N>(bits, tmp); + } + return create<N>(std::move(tmp)); + } + template <uint8_t N> + std::vector<uint8_t> make_edit_vector() const { + std::vector<uint8_t> result(window_size<N>(), N + 1); + for (const auto &pos: list) { + for (uint32_t i = 0; i < window_size<N>(); ++i) { + result[i] = std::min(result[i], uint8_t(pos.materialize(i).edits)); + } + } + return result; + } + vespalib::string to_string() const { + vespalib::string result = "{"; + for (size_t i = 0; i < list.size(); ++i) { + if (i > 0) { + result.append(","); + } + result.append(list[i].to_string()); + } + result.append("}"); + return result; + } +}; + +// Keeps track of unique states, assigning an integer value to each +// state. Only states with minimal boundary 0 is allowed to be +// inserted into a state repo. Each repo is seeded with the empty +// state (0) and the start state (1). An assigned integer value can be +// mapped back into the originating state. +struct StateRepo { + using Map = std::map<State,uint32_t>; + using Ref = Map::iterator; + Map seen; + std::vector<Ref> refs; + StateRepo() noexcept : seen(), refs() { + auto failed_idx = state_to_idx(State::failed()); + auto start_idx = state_to_idx(State::start()); + assert(failed_idx == 0); + assert(start_idx == 1); + } + ~StateRepo(); + size_t size() const { return seen.size(); } + uint32_t state_to_idx(const State &state) { + assert(state.minimal_boundary() == 0); + uint32_t next = refs.size(); + auto [pos, inserted] = seen.emplace(state, next); + if (inserted) { + refs.push_back(pos); + } + assert(seen.size() == refs.size()); + return pos->second; + } + const State &idx_to_state(uint32_t idx) const { + assert(idx < refs.size()); + return refs[idx]->first; + } +}; +StateRepo::~StateRepo() = default; + +template <uint8_t N> +std::vector<bool> expand_bits(uint32_t value) { + static_assert(N < 10); + std::vector<bool> result(window_size<N>()); + uint32_t look_for = num_transitions<N>(); + assert(value < look_for); + for (size_t i = 0; i < result.size(); ++i) { + look_for >>= 1; + result[i] = (value & look_for); + } + return result; +} + +template <uint8_t N> +[[maybe_unused]] StateRepo make_state_repo() { + StateRepo repo; + for (uint32_t idx = 0; idx < repo.size(); ++idx) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + State new_state = state.next<N>(expand_bits<N>(i)); + (void) new_state.normalize(); + (void) repo.state_to_idx(new_state); + } + } + return repo; +} + +// this is the result of our efforts +template <uint8_t N> +struct Tfa { + struct Entry { + uint8_t step; + uint8_t state; + }; + + // what happens when following a transition from a state? + std::array<std::array<Entry,num_transitions<N>()>,num_states<N>()> table; + + // how many edits did we use to match the target word? + std::array<std::array<uint8_t,window_size<N>()>,num_states<N>()> edits; +}; + +template <uint8_t N> +std::unique_ptr<Tfa<N>> make_tfa() { + auto tfa = std::make_unique<Tfa<N>>(); + StateRepo repo; + uint32_t state_idx = 0; + for (; state_idx < repo.size(); ++state_idx) { + const State &state = repo.idx_to_state(state_idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + State new_state = state.next<N>(expand_bits<N>(i)); + uint32_t step = new_state.normalize(); + uint32_t new_state_idx = repo.state_to_idx(new_state); + assert(step < 256); + assert(new_state_idx < 256); + tfa->table[state_idx][i].step = step; + tfa->table[state_idx][i].state = new_state_idx; + } + auto edits = state.make_edit_vector<N>(); + assert(edits.size() == window_size<N>()); + for (uint32_t i = 0; i < window_size<N>(); ++i) { + tfa->edits[state_idx][i] = edits[i]; + } + } + assert(repo.size() == num_states<N>()); + assert(state_idx == num_states<N>()); + return tfa; +} + +template <uint8_t N> +const Tfa<N> *get_tfa() { + static std::unique_ptr<Tfa<N>> tfa = make_tfa<N>(); + return tfa.get(); +} + +template <typename T> +vespalib::string format_vector(const std::vector<T> &vector, bool compact = false) { + vespalib::string str = compact ? "" : "["; + for (size_t i = 0; i < vector.size(); ++i) { + if (i > 0 && !compact) { + str.append(","); + } + str.append(fmt("%u", uint32_t(vector[i]))); + } + if (!compact) { + str.append("]"); + } + return str; +} + +template <uint8_t N> +struct TableMatcher { + struct S { + uint32_t index; + uint32_t state; + // needed by dfa matcher concept (should use std::declval instead) + constexpr S() noexcept : index(0), state(0) {} + constexpr S(uint32_t i, uint32_t s) noexcept : index(i), state(s) {} + S next(const Tfa<N> *tfa, uint32_t bits) noexcept { + auto entry = tfa->table[state][bits]; + return S(index + entry.step, entry.state); + } + constexpr bool is_valid_edge(const Tfa<N> *tfa, uint32_t bits) const noexcept { + return tfa->table[state][bits].state != 0; + } + }; + using StateType = S; + using StateParamType = StateType; + using EdgeType = uint32_t; + + const Tfa<N> *tfa; + const TableDfa<N>::Lookup *lookup; + const uint32_t end; + const bool cased; + + TableMatcher(const Tfa<N> *tfa_in, const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in) + noexcept : tfa(tfa_in), lookup(lookup_in), end(end_in), cased(cased_in) {} + + bool is_cased() const noexcept { return cased; } + static constexpr S start() noexcept { return S(0, 1); } + + uint8_t match_edit_distance(S s) const noexcept { + uint32_t leap = (end - s.index); + return (leap < window_size<N>()) ? tfa->edits[s.state][leap] : N + 1; + } + bool is_match(S s) const noexcept { return match_edit_distance(s) <= N; } + + static constexpr bool can_match(S s) noexcept { return (s.state != 0); } + static constexpr bool valid_state(S s) noexcept { return (s.state != 0); } + + S match_wildcard(S s) const noexcept { return s.next(tfa, 0); } + S match_input(S s, uint32_t c) const noexcept { + const auto *slice = lookup[s.index].list.data(); + for (size_t i = 0; i < window_size<N>() && slice[i].input != 0; ++i) { + if (slice[i].input == c) { + return s.next(tfa, slice[i].match); + } + } + return match_wildcard(s); + } + + bool has_higher_out_edge(S s, uint32_t c) const noexcept { + if (s.is_valid_edge(tfa, 0)) { + return true; + } + const auto *slice = lookup[s.index].list.data(); + for (size_t i = 0; i < window_size<N>() && slice[i].input > c; ++i) { + if (s.is_valid_edge(tfa, slice[i].match)) { + return true; + } + } + return false; + } + + bool has_exact_explicit_out_edge(S s, uint32_t c) const noexcept { + const auto *slice = lookup[s.index].list.data(); + for (size_t i = 0; i < window_size<N>() && slice[i].input >= c; ++i) { + if (slice[i].input == c) { + return s.is_valid_edge(tfa, slice[i].match); + } + } + return false; + } + + uint32_t lowest_higher_explicit_out_edge(S s, uint32_t c) const noexcept { + const auto *slice = lookup[s.index].list.data(); + size_t i = window_size<N>(); + while (i-- > 0) { + if (slice[i].input > c && s.is_valid_edge(tfa, slice[i].match)) { + return slice[i].input; + } + } + return 0; + } + + uint32_t smallest_explicit_out_edge(S s) const noexcept { + const auto *slice = lookup[s.index].list.data(); + size_t i = window_size<N>(); + while (i-- > 0) { + if (slice[i].input != 0 && s.is_valid_edge(tfa, slice[i].match)) { + return slice[i].input; + } + } + return 0; + } + + static constexpr bool valid_edge(uint32_t c) noexcept { return c != 0; } + static constexpr uint32_t edge_to_u32char(uint32_t c) noexcept { return c; } + S edge_to_state(S s, uint32_t c) const noexcept { return match_input(s, c); } + + static constexpr bool implies_exact_match_suffix(S) noexcept { return false; } + static constexpr void emit_exact_match_suffix(S, std::vector<uint32_t> &) {} // not called + static constexpr void emit_exact_match_suffix(S, std::string &) {} // not called +}; + +} // unnamed + +template <uint8_t N> +auto +TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup> +{ + std::vector<Lookup> result(str.size() + 1); + auto have_already = [&](uint32_t c, size_t i)noexcept{ + for (size_t j = 0; j < window_size(); ++j) { + if (result[i].list[j].input == c) { + return true; + } + } + return false; + }; + auto make_vector = [&](uint32_t c, size_t i)noexcept{ + uint32_t bits = 0; + for (size_t j = 0; j < window_size(); ++j) { + bool found = ((i + j) < str.size()) && (str[i+j] == c); + bits = (bits << 1) + found; + } + return bits; + }; + for (size_t i = 0; i < str.size(); ++i) { + for (size_t j = 0; j < window_size(); ++j) { + assert(result[i].list[j].input == 0); + assert(result[i].list[j].match == 0); + if ((i + j) < str.size()) { + uint32_t c = str[i + j]; + if (!have_already(c, i)) { + result[i].list[j].input = c; + result[i].list[j].match = make_vector(c, i); + } + } + } + std::sort(result[i].list.begin(), result[i].list.end(), + [](const auto &a, const auto &b){ return a.input > b.input; }); + } + return result; +} + +template <uint8_t N> +TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased) + : _tfa(get_tfa<N>()), + _lookup(make_lookup(str)), + _is_cased(is_cased) +{ +} + +template <uint8_t N> +TableDfa<N>::~TableDfa() = default; + +template <uint8_t N> +LevenshteinDfa::MatchResult +TableDfa<N>::match(std::string_view u8str) const +{ + TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased); + return MatchAlgorithm<N>::match(matcher, u8str); +} + +template <uint8_t N> +LevenshteinDfa::MatchResult +TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const +{ + TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased); + return MatchAlgorithm<N>::match(matcher, u8str, successor_out); +} + +template <uint8_t N> +LevenshteinDfa::MatchResult +TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const +{ + TableMatcher matcher(static_cast<const Tfa<N>*>(_tfa), _lookup.data(), _lookup.size() - 1, _is_cased); + return MatchAlgorithm<N>::match(matcher, u8str, successor_out); +} + +template <uint8_t N> +size_t +TableDfa<N>::memory_usage() const noexcept +{ + return _lookup.size() * sizeof(Lookup); +} + +template <uint8_t N> +void +TableDfa<N>::dump_as_graphviz(std::ostream &os) const +{ + os << std::dec << "digraph table_dfa {\n"; + for (size_t i = 0; i < _lookup.size(); ++i) { + for (size_t j = 0; j < window_size(); ++j) { + if (_lookup[i].list[j].input != 0) { + std::string as_utf8; + append_utf32_char(as_utf8, _lookup[i].list[j].input); + os << " x" << i << " -> " << _lookup[i].list[j].match << " [label=\"" << as_utf8 << "\"];\n"; + } + } + os << " x" << i << " -> 0 [label=\"*\"];\n"; + } + os << "}\n"; +} + +} |