diff options
Diffstat (limited to 'vespalib')
33 files changed, 1348 insertions, 334 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index c1d7e17b457..56dcd9abdf6 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -96,6 +96,7 @@ vespa_define_module( src/tests/fileheader src/tests/floatingpointtype src/tests/fuzzy + src/tests/fuzzy/table_dfa src/tests/gencnt src/tests/growablebytebuffer src/tests/guard diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index c235cb99509..919ba328085 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -11,6 +11,7 @@ #include <string> #include <string_view> #include <gtest/gtest.h> +#include <gmock/gmock.h> using namespace ::testing; using namespace vespalib::fuzzy; @@ -82,7 +83,8 @@ INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes, Combine(Values(LevenshteinDfa::Casing::Uncased, LevenshteinDfa::Casing::Cased), Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit)), + LevenshteinDfa::DfaType::Implicit, + LevenshteinDfa::DfaType::Table)), LevenshteinDfaTest::stringify_params); // Same as existing non-DFA Levenshtein tests, but with some added instantiations @@ -122,8 +124,10 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt); } -void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) { - std::string successor; +void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, + std::string_view expected_successor, std::string_view successor_prefix) +{ + std::string successor(successor_prefix); auto m = dfa.match(source, successor); if (m.matches()) { FAIL() << "Expected '" << source << "' to emit a successor, but it " @@ -131,15 +135,21 @@ void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std: << " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)"; } EXPECT_EQ(successor, expected_successor); - EXPECT_TRUE(dfa.match(successor).matches()); + // Must skip any caller-provided successor prefix before checking if it matches the target + auto successor_suffix = successor.substr(successor_prefix.size()); + EXPECT_TRUE(dfa.match(successor_suffix).matches()); // Make sure the UTF-32 successor output is codepoint-wise identical to the UTF-8 successor - std::vector<uint32_t> u32successor; + std::vector<uint32_t> u32successor(utf8_string_to_utf32(successor_prefix)); m = dfa.match(source, u32successor); EXPECT_FALSE(m.matches()); expect_utf32_string_code_point_equal_to_utf8(u32successor, successor); } +void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) { + test_dfa_successor(dfa, source, expected_successor, {}); +} + TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) { auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); @@ -201,6 +211,28 @@ TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) { test_dfa_successor(dfa, "vespa", "w"); } +TEST_P(LevenshteinDfaTest, caller_provided_successor_prefix_is_preserved_on_mismatch) { + auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); + + // Same inputs as existing successor tests, but with a preserved prefix in the generated successor + test_dfa_successor(dfa, "", "yolo\x01""food", "yolo"); + test_dfa_successor(dfa, "faa", "xyzfaod", "xyz"); + test_dfa_successor(dfa, "fooooo", "ABCfoop", "ABC"); + test_dfa_successor(dfa, "ooof", "ABCpfood", "ABC"); + test_dfa_successor(dfa, "gp", "yolohfood", "yolo"); + + dfa = LevenshteinDfa::build("", 1, casing(), dfa_type()); + test_dfa_successor(dfa, "aa", "foob", "foo"); +} + +TEST_P(LevenshteinDfaTest, caller_provided_successor_prefix_is_preserved_on_match) { + auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); + std::string successor = "bar"; + auto m = dfa.match("mood", successor); + EXPECT_TRUE(m.matches()); + EXPECT_THAT(successor, StartsWith("bar")); +} + // We should normally be able to rely on higher-level components to ensure we // only receive valid UTF-8, but make sure we don't choke on it if we do get it. TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) { @@ -233,7 +265,8 @@ struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> { INSTANTIATE_TEST_SUITE_P(AllDfaTypes, LevenshteinDfaCasingTest, Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit), + LevenshteinDfa::DfaType::Implicit, + LevenshteinDfa::DfaType::Table), PrintToStringParamName()); TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) { @@ -315,7 +348,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, Combine(Values(LevenshteinDfa::Casing::Uncased, LevenshteinDfa::Casing::Cased), Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit), + LevenshteinDfa::DfaType::Implicit, + LevenshteinDfa::DfaType::Table), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); @@ -342,6 +376,7 @@ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast<uint8_t>(j)); + successor.clear(); auto maybe_match = target_dfa.match(source, successor); if (maybe_match.matches() && !skip_to.empty()) { ASSERT_GE(source, skip_to); @@ -597,6 +632,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) { auto end = dict.cend(); std::string successor; while (iter != end) { + successor.clear(); auto maybe_match = dfa.match(*iter, successor); if (maybe_match.matches()) { ++iter; diff --git a/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt new file mode 100644 index 00000000000..1017ac99564 --- /dev/null +++ b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_fuzzy_table_dfa_test_app TEST + SOURCES + table_dfa_test.cpp + DEPENDS + vespalib + GTest::GTest + ) +vespa_add_test(NAME vespalib_fuzzy_table_dfa_test_app COMMAND vespalib_fuzzy_table_dfa_test_app) diff --git a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp new file mode 100644 index 00000000000..acb68a56de7 --- /dev/null +++ b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp @@ -0,0 +1,395 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/fuzzy/table_dfa.hpp> +#include <vespa/vespalib/gtest/gtest.h> +#include <set> + +using namespace ::testing; +using namespace vespalib::fuzzy; + +// test/experiment with low-level concepts underlying the construction +// of the tables used in the table-driven dfa implementation. + +TEST(TableDfaTest, position) { + Position pos1 = Position::start(); + EXPECT_EQ(pos1.index, 0); + EXPECT_EQ(pos1.edits, 0); + Position pos2(2, 3); + EXPECT_EQ(pos2.index, 2); + EXPECT_EQ(pos2.edits, 3); +} + +TEST(TableDfaTest, position_equality) { + Position pos1(0, 0); + Position pos2(0, 1); + Position pos3(1, 0); + EXPECT_TRUE(pos1 == pos1); + EXPECT_FALSE(pos1 == pos2); + EXPECT_FALSE(pos1 == pos2); +} + +TEST(TableDfaTest, position_sort_order) { + std::vector<Position> list; + list.emplace_back(0,1); + list.emplace_back(0,0); + list.emplace_back(1,0); + list.emplace_back(1,1); + std::sort(list.begin(), list.end()); + EXPECT_EQ(list[0].index, 0); + EXPECT_EQ(list[0].edits, 0); + EXPECT_EQ(list[1].index, 1); + EXPECT_EQ(list[1].edits, 0); + EXPECT_EQ(list[2].index, 0); + EXPECT_EQ(list[2].edits, 1); + EXPECT_EQ(list[3].index, 1); + EXPECT_EQ(list[3].edits, 1); +} + +TEST(TableDfaTest, position_subsumption) { + Position pos1(0, 0); + Position pos2(0, 1); + Position pos3(0, 2); + + Position pos4(1, 0); + Position pos5(1, 1); + Position pos6(1, 2); + + Position pos7(2, 0); + Position pos8(2, 1); + Position pos9(2, 2); + + EXPECT_FALSE(pos1.subsumes(pos1)); + EXPECT_TRUE(pos1.subsumes(pos2)); + EXPECT_TRUE(pos1.subsumes(pos3)); + EXPECT_FALSE(pos1.subsumes(pos4)); + EXPECT_TRUE(pos1.subsumes(pos5)); + EXPECT_TRUE(pos1.subsumes(pos6)); + EXPECT_FALSE(pos1.subsumes(pos7)); + EXPECT_FALSE(pos1.subsumes(pos8)); + EXPECT_TRUE(pos1.subsumes(pos9)); + + EXPECT_FALSE(pos5.subsumes(pos1)); + EXPECT_FALSE(pos5.subsumes(pos2)); + EXPECT_TRUE(pos5.subsumes(pos3)); + EXPECT_FALSE(pos5.subsumes(pos4)); + EXPECT_FALSE(pos5.subsumes(pos5)); + EXPECT_TRUE(pos5.subsumes(pos6)); + EXPECT_FALSE(pos5.subsumes(pos7)); + EXPECT_FALSE(pos5.subsumes(pos8)); + EXPECT_TRUE(pos5.subsumes(pos9)); +} + +TEST(TableDfaTest, position_materialization) { + EXPECT_EQ(Position(1,1).materialize(0).index, 0); + EXPECT_EQ(Position(1,1).materialize(1).index, 1); + EXPECT_EQ(Position(1,1).materialize(2).index, 2); + EXPECT_EQ(Position(1,1).materialize(0).edits, 2); + EXPECT_EQ(Position(1,1).materialize(1).edits, 1); + EXPECT_EQ(Position(1,1).materialize(2).edits, 2); +} + +TEST(TableDfaTest, position_to_string) { + Position pos1(0, 0); + Position pos2(1, 2); + Position pos3(2, 3); + EXPECT_EQ(pos1.to_string(), fmt("0#0")); + EXPECT_EQ(pos2.to_string(), fmt("1#2")); + EXPECT_EQ(pos3.to_string(), fmt("2#3")); +} + +TEST(TableDfaTest, state_creation_reorder) { + EXPECT_EQ(State::create<5>({{0,1},{2,0}}).to_string(), fmt("{2#0,0#1}")); + EXPECT_EQ(State::create<5>({{2,0},{0,0}}).to_string(), fmt("{0#0,2#0}")); +} + +TEST(TableDfaTest, state_creation_duplicate_removal) { + EXPECT_EQ(State::create<5>({{0,0},{0,0},{2,1},{2,1}}).to_string(), fmt("{0#0,2#1}")); +} + +TEST(TableDfaTest, state_creation_edit_cutoff) { + EXPECT_EQ(State::create<2>({{0,0},{5,2},{10,3}}).to_string(), fmt("{0#0,5#2}")); +} + +TEST(TableDfaTest, state_creation_subsumption_collapsing) { + EXPECT_EQ(State::create<2>({{0,0},{1,1}}).to_string(), fmt("{0#0}")); + EXPECT_EQ(State::create<2>({{0,1},{1,0}}).to_string(), fmt("{1#0}")); + EXPECT_EQ(State::create<2>({{0,0},{2,2}}).to_string(), fmt("{0#0}")); + EXPECT_EQ(State::create<2>({{0,2},{2,0}}).to_string(), fmt("{2#0}")); +} + +TEST(TableDfaTest, state_normalization) { + auto state1 = State::create<2>({{2,1},{3,1}}); + auto state2 = State::create<2>({{5,0},{3,1}}); + EXPECT_EQ(state1.to_string(), fmt("{2#1,3#1}")); + EXPECT_EQ(state2.to_string(), fmt("{5#0,3#1}")); + EXPECT_EQ(state1.normalize(), 2); + EXPECT_EQ(state2.normalize(), 3); + EXPECT_EQ(state1.to_string(), fmt("{0#1,1#1}")); + EXPECT_EQ(state2.to_string(), fmt("{2#0,0#1}")); +} + +TEST(TableDfaTest, state_repo) { + StateRepo repo; + EXPECT_EQ(repo.state_to_idx(State::failed()), 0); + EXPECT_EQ(repo.state_to_idx(State::start()), 1); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2); + EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3); + EXPECT_EQ(repo.size(), 4); + EXPECT_EQ(repo.idx_to_state(0).to_string(), fmt("{}")); + EXPECT_EQ(repo.idx_to_state(1).to_string(), fmt("{0#0}")); + EXPECT_EQ(repo.idx_to_state(2).to_string(), fmt("{0#0,1#0}")); + EXPECT_EQ(repo.idx_to_state(3).to_string(), fmt("{0#0,2#1}")); +} + +TEST(TableDfaTest, expand_bits) { + auto yes = expand_bits<2>(0x1f); + auto no = expand_bits<2>(0x00); + auto odd = expand_bits<2>(0x0a); + auto even = expand_bits<2>(0x15); + ASSERT_EQ(yes.size(), 5); + ASSERT_EQ(no.size(), 5); + ASSERT_EQ(odd.size(), 5); + ASSERT_EQ(even.size(), 5); + for (size_t i = 0; i < 5; ++i) { + EXPECT_TRUE(yes[i]); + EXPECT_FALSE(no[i]); + EXPECT_EQ(odd[i], bool(i % 2 == 1)); + EXPECT_EQ(even[i], bool(i % 2 == 0)); + } +} + +TEST(TableDfaTest, format_bits) { + EXPECT_EQ(format_vector(expand_bits<1>(0)), fmt("[0,0,0]")); + EXPECT_EQ(format_vector(expand_bits<1>(7)), fmt("[1,1,1]")); + EXPECT_EQ(format_vector(expand_bits<1>(5)), fmt("[1,0,1]")); + EXPECT_EQ(format_vector(expand_bits<1>(2)), fmt("[0,1,0]")); + EXPECT_EQ(format_vector(expand_bits<2>(31)), fmt("[1,1,1,1,1]")); + EXPECT_EQ(format_vector(expand_bits<2>(21)), fmt("[1,0,1,0,1]")); + EXPECT_EQ(format_vector(expand_bits<2>(31), true), fmt("11111")); + EXPECT_EQ(format_vector(expand_bits<2>(21), true), fmt("10101")); +} + +template <uint8_t N> +void list_states() { + auto repo = make_state_repo<N>(); + EXPECT_EQ(num_states<N>(), repo.size()); + fprintf(stderr, "max_edits: %u, number of states: %zu\n", N, repo.size()); + for (uint32_t i = 0; i < repo.size(); ++i) { + fprintf(stderr, " state %u: %s\n", i, repo.idx_to_state(i).to_string().c_str()); + } +} + +TEST(TableDfaTest, list_states_for_max_edits_1) { list_states<1>(); } +TEST(TableDfaTest, list_states_for_max_edits_2) { list_states<2>(); } + +template <uint8_t N> +void list_edits() { + auto repo = make_state_repo<N>(); + fprintf(stderr, + "per state, listing the minimal number of edits needed\n" + "to reach offsets at and beyond its minimal boundary\n"); + for (uint32_t i = 0; i < repo.size(); ++i) { + const State &state = repo.idx_to_state(i); + fprintf(stderr, "%-23s : %s\n", state.to_string().c_str(), + format_vector(state.make_edit_vector<N>()).c_str()); + } +} + +TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_1) { list_edits<1>(); } +TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_2) { list_edits<2>(); } + +template <uint8_t N> +void list_transitions() { + auto repo = make_state_repo<N>(); + for (uint32_t idx = 0; idx < repo.size(); ++idx) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + auto bits = expand_bits<N>(i); + State new_state = state.next<N>(bits); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + fprintf(stderr, "%u:%s,i --%s--> %u:%s,%s\n", idx, state.to_string().c_str(), + format_vector(bits).c_str(), new_idx, new_state.to_string().c_str(), + (step == 0) ? "i" : fmt("i+%u", step).c_str()); + } + } +} + +TEST(TableDfaTest, list_transitions_for_max_edits_1) { list_transitions<1>(); } + +// Simulate all possible ways we can approach the end of the word we +// are matching. Verify that no transition taken can produce a state +// with a minimal boundary that exceeds the boundary of the word +// itself. Verifying this will enable us to not care about word size +// while simulating the dfa. +template <uint8_t N> +void verify_word_end_boundary() { + auto repo = make_state_repo<N>(); + using StateSet = std::set<uint32_t>; + std::vector<StateSet> active(window_size<N>() + 1); + for (size_t i = 1; i < repo.size(); ++i) { + active[0].insert(i); + } + EXPECT_EQ(active.size(), window_size<N>() + 1); + EXPECT_EQ(active[0].size(), repo.size() - 1); + fprintf(stderr, "verifying word end for max edits %u\n", N); + uint32_t edge_shape = 0; + for (uint32_t active_idx = 0; active_idx < active.size(); ++active_idx) { + fprintf(stderr, " edge shape: %s, max step: %zu, active_states: %zu\n", + format_vector(expand_bits<N>(edge_shape)).c_str(), active.size() - active_idx - 1, active[active_idx].size()); + for (uint32_t idx: active[active_idx]) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + if ((i & edge_shape) == 0) { + State new_state = state.next<N>(expand_bits<N>(i)); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + if (new_idx != 0) { + ASSERT_GT(active.size(), active_idx + step); + active[active_idx + step].insert(new_idx); + } + } + } + } + edge_shape = (edge_shape << 1) + 1; + } + EXPECT_EQ(edge_shape, (1 << (window_size<N>() + 1)) - 1); + while (!active.back().empty()) { + fprintf(stderr, " residue states after word end: %zu\n", active.back().size()); + StateSet residue; + for (uint32_t idx: active.back()) { + const State &state = repo.idx_to_state(idx); + State new_state = state.next<N>(expand_bits<N>(0)); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + ASSERT_EQ(step, 0); + if (new_idx != 0) { + residue.insert(new_idx); + } + } + active.back() = std::move(residue); + } +} + +TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_1) { + verify_word_end_boundary<1>(); +} + +TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_2) { + verify_word_end_boundary<2>(); +} + +template <uint8_t N> +void verify_inline_tfa() { + auto tfa = make_tfa<N>(); + fprintf(stderr, "verifying TFA for N = %u (byte size: %zu)\n", N, sizeof(*tfa)); + ASSERT_EQ(tfa->table.size(), num_states<N>()); + ASSERT_EQ(tfa->edits.size(), num_states<N>()); + for (size_t state = 0; state < num_states<N>(); ++state) { + ASSERT_EQ(tfa->table[state].size(), num_transitions<N>()); + for (size_t transition = 0; transition < num_transitions<N>(); ++transition) { + EXPECT_EQ(tfa->table[state][transition].step, InlineTfa<N>::table[state][transition].step); + EXPECT_EQ(tfa->table[state][transition].state, InlineTfa<N>::table[state][transition].state); + } + ASSERT_EQ(tfa->edits[state].size(), window_size<N>()); + for (size_t offset = 0; offset < window_size<N>(); ++offset) { + EXPECT_EQ(tfa->edits[state][offset], InlineTfa<N>::edits[state][offset]); + } + } +} + +TEST(TableDfaTest, verify_inline_tfa_with_max_edits_1) { + verify_inline_tfa<1>(); +} + +TEST(TableDfaTest, verify_inline_tfa_with_max_edits_2) { + verify_inline_tfa<2>(); +} + +template <uint8_t N> +void dump_tfa_as_code() { + auto tfa = make_tfa<N>(); + fprintf(stderr, "// start of auto-generated code for N = %u\n", N); + fprintf(stderr, "template <> struct InlineTfa<%u> {\n", N); + fprintf(stderr, " static constexpr Transition table[%zu][%zu] = {\n", num_states<N>(), num_transitions<N>()); + for (size_t state = 0; state < num_states<N>(); ++state) { + fprintf(stderr, " {"); + for (size_t transition = 0; transition < num_transitions<N>(); ++transition) { + if (transition > 0) { + fprintf(stderr, ","); + } + fprintf(stderr, "{%u,%u}", tfa->table[state][transition].step, tfa->table[state][transition].state); + } + fprintf(stderr, "}%s\n", ((state + 1) < num_states<N>()) ? "," : ""); + } + fprintf(stderr, " };\n"); + fprintf(stderr, " static constexpr uint8_t edits[%zu][%zu] = {\n", num_states<N>(), window_size<N>()); + for (size_t state = 0; state < num_states<N>(); ++state) { + fprintf(stderr, " {"); + for (size_t offset = 0; offset < window_size<N>(); ++offset) { + if (offset > 0) { + fprintf(stderr, ","); + } + fprintf(stderr, "%u", tfa->edits[state][offset]); + } + fprintf(stderr, "}%s\n", ((state + 1) < num_states<N>()) ? "," : ""); + } + fprintf(stderr, " };\n"); + fprintf(stderr, "};\n"); + fprintf(stderr, "// end of auto-generated code for N = %u\n", N); +} + +TEST(TableDfaTest, dump_tfa_with_max_edits_1_as_code) { + dump_tfa_as_code<1>(); +} + +TEST(TableDfaTest, dump_tfa_with_max_edits_2_as_code) { + dump_tfa_as_code<2>(); +} + +template <uint8_t N> +void dump_tfa_graph() { + auto repo = make_state_repo<N>(); + fprintf(stderr, "digraph tfa {\n"); + for (uint32_t idx = 0; idx < repo.size(); ++idx) { + fprintf(stderr, " %u [label=\"%s\"];\n", idx, + repo.idx_to_state(idx).to_string().c_str()); + } + // omit transitions from the failure state to itself + for (uint32_t idx = 1; idx < repo.size(); ++idx) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + auto bits = expand_bits<N>(i); + State new_state = state.next<N>(bits); + uint32_t step = new_state.normalize(); + uint32_t new_idx = repo.state_to_idx(new_state); + ASSERT_LT(new_idx, repo.size()); + if (bits[0] && idx == new_idx && step == 1) { + // omit simple transitions to yourself + } else { + fprintf(stderr, " %u -> %u [label=\"%s,%u\"];\n", idx, new_idx, + format_vector(bits, true).c_str(), step); + } + } + } + fprintf(stderr, "}\n"); +} + +TEST(TableDfaTest, graphviz_for_tfa_with_max_edits_1) { + dump_tfa_graph<1>(); +} + +TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) { + auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table); + std::ostringstream out; + dfa.dump_as_graphviz(out); + fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage()); + fprintf(stderr, "%s", out.str().c_str()); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/tests/memorydatastore/memorydatastore.cpp b/vespalib/src/tests/memorydatastore/memorydatastore.cpp index 1d49b0af91b..7eab32601de 100644 --- a/vespalib/src/tests/memorydatastore/memorydatastore.cpp +++ b/vespalib/src/tests/memorydatastore/memorydatastore.cpp @@ -6,19 +6,9 @@ using namespace vespalib; -class MemoryDataStoreTest : public vespalib::TestApp +TEST("testMemoryDataStore") { -private: - void testMemoryDataStore(); - void testVariableSizeVector(); -public: - int Main() override; -}; - -void -MemoryDataStoreTest::testMemoryDataStore() -{ - MemoryDataStore s(alloc::Alloc::alloc(256)); + MemoryDataStore s(alloc::Alloc::alloc(256), nullptr); std::vector<MemoryDataStore::Reference> v; v.push_back(s.push_back("mumbo", 5)); for (size_t i(0); i < 50; i++) { @@ -28,45 +18,9 @@ MemoryDataStoreTest::testMemoryDataStore() v.push_back(s.push_back("mumbo", 5)); EXPECT_EQUAL(52ul, v.size()); EXPECT_NOT_EQUAL(static_cast<const char *>(v[50].data()) + 5, v[51].data()); - for (size_t i(0); i < v.size(); i++) { - EXPECT_EQUAL(0, memcmp("mumbo", v[i].data(), 5)); + for (auto & i : v) { + EXPECT_EQUAL(0, memcmp("mumbo", i.data(), 5)); } } -void -MemoryDataStoreTest::testVariableSizeVector() -{ - VariableSizeVector v(20000, 5*20000); - for (size_t i(0); i < 10000; i++) { - asciistream os; - os << i; - v.push_back(os.str().data(), os.str().size()); - } - for (size_t i(0); i < v.size(); i++) { - asciistream os; - os << i; - EXPECT_EQUAL(os.str().size(), v[i].size()); - EXPECT_EQUAL(0, memcmp(os.str().data(), v[i].data(), os.str().size())); - } - size_t i(0); - for (auto it(v.begin()), mt(v.end()); it != mt; it++, i++) { - asciistream os; - os << i; - EXPECT_EQUAL(os.str().size(), it->size()); - EXPECT_EQUAL(0, memcmp(os.str().data(), (*it).data(), os.str().size())); - } - -} - -int -MemoryDataStoreTest::Main() -{ - TEST_INIT("data_test"); - testMemoryDataStore(); - testVariableSizeVector(); - - TEST_DONE(); -} - -TEST_APPHOOK(MemoryDataStoreTest); - +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/tests/stllike/hash_test.cpp b/vespalib/src/tests/stllike/hash_test.cpp index ae27d2dc58b..dc6e48100a9 100644 --- a/vespalib/src/tests/stllike/hash_test.cpp +++ b/vespalib/src/tests/stllike/hash_test.cpp @@ -494,6 +494,14 @@ TEST("test hash set initializer list - empty") EXPECT_EQUAL(0u, s.size()); } +TEST("empty hash_set can be looked up") +{ + IntHashSet s; + EXPECT_EQUAL(0u, s.size()); + EXPECT_EQUAL(1u, s.capacity()); + EXPECT_TRUE(s.find(1) == s.end()); +} + TEST("test hash set initializer list - 1 element") { IntHashSet s = {1}; diff --git a/vespalib/src/vespa/vespalib/btree/btreeiterator.h b/vespalib/src/vespa/vespalib/btree/btreeiterator.h index 2418da18c23..9480e5880e0 100644 --- a/vespalib/src/vespa/vespalib/btree/btreeiterator.h +++ b/vespalib/src/vespa/vespalib/btree/btreeiterator.h @@ -302,22 +302,22 @@ public: /** * Get key at current iterator location. */ - const KeyType & getKey() const { return _leaf.getKey(); } + const KeyType & getKey() const noexcept { return _leaf.getKey(); } /** * Get data at current iterator location. */ - const DataType & getData() const { return _leaf.getData(); } + const DataType & getData() const noexcept { return _leaf.getData(); } /** * Check if iterator is at a valid element, i.e. not at end. */ - bool valid() const { return _leaf.valid(); } + bool valid() const noexcept{ return _leaf.valid(); } /** * Return the number of elements in the tree. */ - size_t size() const; + size_t size() const noexcept; /** @@ -333,7 +333,7 @@ public: /** * Return if the tree has data or not (e.g. keys and data or only keys). */ - static bool hasData() { return LeafNodeType::hasData(); } + static bool hasData() noexcept { return LeafNodeType::hasData(); } /** * Move the iterator directly to end. Used by findHelper method in BTree. diff --git a/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp b/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp index b7927feaa1a..d6dda0047ce 100644 --- a/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp +++ b/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp @@ -387,15 +387,13 @@ position(uint32_t levels) const res += inode->validLeaves(); for (uint32_t c = elem.getIdx(); c < slots; ++c) { BTreeNode::Ref node = inode->getChild(c); - const InternalNodeType *jnode = - _allocator->mapInternalRef(node); + const InternalNodeType *jnode = _allocator->mapInternalRef(node); res -= jnode->validLeaves(); } } else { for (uint32_t c = 0; c < elem.getIdx(); ++c) { BTreeNode::Ref node = inode->getChild(c); - const InternalNodeType *jnode = - _allocator->mapInternalRef(node); + const InternalNodeType *jnode = _allocator->mapInternalRef(node); res += jnode->validLeaves(); } } @@ -484,7 +482,7 @@ template <typename KeyT, typename DataT, typename AggrT, uint32_t INTERNAL_SLOTS, uint32_t LEAF_SLOTS, uint32_t PATH_SIZE> size_t BTreeIteratorBase<KeyT, DataT, AggrT, INTERNAL_SLOTS, LEAF_SLOTS, PATH_SIZE>:: -size() const +size() const noexcept { if (_pathSize > 0) { return _path[_pathSize - 1].getNode()->validLeaves(); diff --git a/vespalib/src/vespa/vespalib/btree/btreenode.h b/vespalib/src/vespa/vespalib/btree/btreenode.h index 0a77a0b4685..4931021d771 100644 --- a/vespalib/src/vespa/vespalib/btree/btreenode.h +++ b/vespalib/src/vespa/vespalib/btree/btreenode.h @@ -67,14 +67,14 @@ public: using Ref = datastore::EntryRef; using ChildRef = datastore::AtomicEntryRef; - bool isLeaf() const { return _level == 0u; } - bool getFrozen() const { return _isFrozen; } - void freeze() { _isFrozen = true; } - void unFreeze() { _isFrozen = false; } - void setLevel(uint8_t level) { _level = level; } - uint32_t getLevel() const { return _level; } - uint32_t validSlots() const { return _validSlots; } - void setValidSlots(uint16_t validSlots_) { _validSlots = validSlots_; } + bool isLeaf() const noexcept { return _level == 0u; } + bool getFrozen() const noexcept { return _isFrozen; } + void freeze() noexcept { _isFrozen = true; } + void unFreeze() noexcept { _isFrozen = false; } + void setLevel(uint8_t level) noexcept { _level = level; } + uint32_t getLevel() const noexcept { return _level; } + uint32_t validSlots() const noexcept { return _validSlots; } + void setValidSlots(uint16_t validSlots_) noexcept { _validSlots = validSlots_; } }; @@ -358,7 +358,7 @@ public: void insert(uint32_t idx, const KeyT & key, BTreeNode::Ref child) { insert(idx, key, BTreeNode::ChildRef(child)); } - uint32_t validLeaves() const { return _validLeaves; } + uint32_t validLeaves() const noexcept { return _validLeaves; } void setValidLeaves(uint32_t newValidLeaves) { _validLeaves = newValidLeaves; } void incValidLeaves(uint32_t delta) { _validLeaves += delta; } void decValidLeaves(uint32_t delta) { _validLeaves -= delta; } diff --git a/vespalib/src/vespa/vespalib/data/memorydatastore.cpp b/vespalib/src/vespa/vespalib/data/memorydatastore.cpp index 354787690c2..6d483e6ff4e 100644 --- a/vespalib/src/vespa/vespalib/data/memorydatastore.cpp +++ b/vespalib/src/vespa/vespalib/data/memorydatastore.cpp @@ -41,21 +41,4 @@ MemoryDataStore::push_back(const void * data, const size_t sz) return ref; } -VariableSizeVector::VariableSizeVector(size_t initialCount, size_t initialBufferSize) - : _vector(), - _store(Alloc::alloc(initialBufferSize)) -{ - _vector.reserve(initialCount); -} - -VariableSizeVector::~VariableSizeVector() = default; - -VariableSizeVector::Reference -VariableSizeVector::push_back(const void * data, const size_t sz) -{ - MemoryDataStore::Reference ptr(_store.push_back(data, sz)); - _vector.push_back(Reference(ptr.data(), sz)); - return _vector.back(); -} - } // namespace vespalib diff --git a/vespalib/src/vespa/vespalib/data/memorydatastore.h b/vespalib/src/vespa/vespalib/data/memorydatastore.h index 7022eb88051..6691211cdd0 100644 --- a/vespalib/src/vespa/vespalib/data/memorydatastore.h +++ b/vespalib/src/vespa/vespalib/data/memorydatastore.h @@ -13,18 +13,19 @@ namespace vespalib { * It has the important property that once an object has been allocated it does not move in memory. * It will start of by allocating one backing buffer and items stored will be appended here. * When limit is exceeded a new buffer is allocated with twice the size of the previous and so it goes. + * You can also provide an optional lock to make it thread safe. **/ class MemoryDataStore { public: class Reference { public: - Reference(void * data_) noexcept : _data(data_) { } + explicit Reference(void * data_) noexcept : _data(data_) { } void * data() noexcept { return _data; } const char * c_str() const noexcept { return static_cast<const char *>(_data); } private: void * _data; }; - MemoryDataStore(alloc::Alloc && initialAlloc=alloc::Alloc::alloc(256), std::mutex * lock=nullptr); + MemoryDataStore(alloc::Alloc && initialAlloc, std::mutex * lock); MemoryDataStore(const MemoryDataStore &) = delete; MemoryDataStore & operator = (const MemoryDataStore &) = delete; ~MemoryDataStore(); @@ -33,7 +34,7 @@ public: * for the lifetime of this object. * @return A pointer/reference to the freshly stored object. */ - Reference push_back(const void * data, const size_t sz); + Reference push_back(const void * data, size_t sz); void swap(MemoryDataStore & rhs) { _buffers.swap(rhs._buffers); } void clear() noexcept { _buffers.clear(); @@ -44,83 +45,5 @@ private: std::mutex * _lock; }; -class VariableSizeVector -{ -public: - class Reference { - public: - Reference(void * data_, size_t sz) noexcept : _data(data_), _sz(sz) { } - void * data() noexcept { return _data; } - const char * c_str() const noexcept { return static_cast<const char *>(_data); } - size_t size() const noexcept { return _sz; } - private: - void * _data; - size_t _sz; - }; - class iterator { - public: - iterator(vespalib::Array<Reference> & v, size_t index) noexcept : _vector(&v), _index(index) {} - Reference & operator * () const noexcept { return (*_vector)[_index]; } - Reference * operator -> () const noexcept { return &(*_vector)[_index]; } - iterator & operator ++ () noexcept { - _index++; - return *this; - } - iterator operator ++ (int) noexcept { - iterator prev = *this; - ++(*this); - return prev; - } - bool operator==(const iterator& rhs) const noexcept { return (_index == rhs._index); } - bool operator!=(const iterator& rhs) const noexcept { return (_index != rhs._index); } - private: - vespalib::Array<Reference> * _vector; - size_t _index; - }; - class const_iterator { - public: - const_iterator(const vespalib::Array<Reference> & v, size_t index) noexcept : _vector(&v), _index(index) {} - const Reference & operator * () const noexcept { return (*_vector)[_index]; } - const Reference * operator -> () const noexcept { return &(*_vector)[_index]; } - const_iterator & operator ++ () noexcept { - _index++; - return *this; - } - const_iterator operator ++ (int) noexcept { - const_iterator prev = *this; - ++(*this); - return prev; - } - bool operator==(const const_iterator& rhs) const noexcept { return (_index == rhs._index); } - bool operator!=(const const_iterator& rhs) const noexcept { return (_index != rhs._index); } - private: - const vespalib::Array<Reference> * _vector; - size_t _index; - }; - VariableSizeVector(const VariableSizeVector &) = delete; - VariableSizeVector & operator = (const VariableSizeVector &) = delete; - VariableSizeVector(size_t initialCount, size_t initialBufferSize); - ~VariableSizeVector(); - iterator begin() noexcept { return iterator(_vector, 0); } - iterator end() noexcept { return iterator(_vector, size()); } - const_iterator begin() const noexcept { return const_iterator(_vector, 0); } - const_iterator end() const noexcept { return const_iterator(_vector, size()); } - Reference push_back(const void * data, const size_t sz); - Reference operator [] (uint32_t index) const noexcept { return _vector[index]; } - size_t size() const noexcept { return _vector.size(); } - bool empty() const noexcept { return _vector.empty(); } - void swap(VariableSizeVector & rhs) noexcept { - _vector.swap(rhs._vector); - _store.swap(rhs._store); - } - void clear() { - _vector.clear(); - _store.clear(); - } -private: - vespalib::Array<Reference> _vector; - MemoryDataStore _store; -}; - } // namespace vespalib diff --git a/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h b/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h index 44dbf05c9da..fb4bc3943ae 100644 --- a/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h +++ b/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h @@ -20,7 +20,7 @@ private: const Memory &_name; public: - NamedSymbolLookup(const SymbolTable &table, const Memory &name) + NamedSymbolLookup(const SymbolTable &table, const Memory &name) noexcept : _table(table), _name(name) {} Symbol lookup() const override; }; diff --git a/vespalib/src/vespa/vespalib/data/slime/slime.cpp b/vespalib/src/vespa/vespalib/data/slime/slime.cpp index d6fac44b360..5a9ec54584d 100644 --- a/vespalib/src/vespa/vespalib/data/slime/slime.cpp +++ b/vespalib/src/vespa/vespalib/data/slime/slime.cpp @@ -6,16 +6,6 @@ namespace vespalib { -Slime::Params::Params() : Params(std::make_unique<SymbolTable>()) { } -Slime::Params::Params(std::unique_ptr<SymbolTable> symbols) noexcept : _symbols(std::move(symbols)), _chunkSize(4096) { } -Slime::Params::Params(Params &&) noexcept = default; -Slime::Params::~Params() = default; - -std::unique_ptr<slime::SymbolTable> -Slime::Params::detachSymbols() { - return std::move(_symbols); -} - Slime::Slime(Params params) : _names(params.detachSymbols()), _stash(std::make_unique<Stash>(params.getChunkSize())), @@ -33,26 +23,6 @@ Slime::reclaimSymbols(Slime &&rhs) { return std::move(rhs._names); } -size_t -Slime::symbols() const noexcept { - return _names->symbols(); -} - -Memory -Slime::inspect(Symbol symbol) const { - return _names->inspect(symbol); -} - -slime::Symbol -Slime::insert(Memory name) { - return _names->insert(name); -} - -slime::Symbol -Slime::lookup(Memory name) const { - return _names->lookup(name); -} - bool operator == (const Slime & a, const Slime & b) noexcept { return a.get() == b.get(); diff --git a/vespalib/src/vespa/vespalib/data/slime/slime.h b/vespalib/src/vespa/vespalib/data/slime/slime.h index a426f906563..4b789838009 100644 --- a/vespalib/src/vespa/vespalib/data/slime/slime.h +++ b/vespalib/src/vespa/vespalib/data/slime/slime.h @@ -21,6 +21,7 @@ #include "symbol.h" #include "symbol_inserter.h" #include "symbol_lookup.h" +#include "symbol_table.h" #include "type.h" #include "value.h" #include "value_factory.h" @@ -51,32 +52,30 @@ private: using Cursor = slime::Cursor; using Inspector = slime::Inspector; - std::unique_ptr<SymbolTable> _names; - std::unique_ptr<Stash> _stash; - RootValue _root; + std::unique_ptr<SymbolTable> _names; + std::unique_ptr<Stash> _stash; + RootValue _root; public: using UP = std::unique_ptr<Slime>; class Params { private: - std::unique_ptr<SymbolTable> _symbols; - size_t _chunkSize; + std::unique_ptr<SymbolTable> _symbols; + size_t _chunkSize; public: - Params(); - explicit Params(std::unique_ptr<SymbolTable> symbols) noexcept; - Params(Params &&) noexcept; - ~Params(); - Params & setChunkSize(size_t chunkSize) { - _chunkSize = chunkSize; - return *this; - } - size_t getChunkSize() const { return _chunkSize; } - std::unique_ptr<SymbolTable> detachSymbols(); + Params() : Params(4096) {} + explicit Params(size_t chunkSize) : _symbols(std::make_unique<SymbolTable>()), _chunkSize(chunkSize) {} + explicit Params(std::unique_ptr<SymbolTable> symbols) noexcept : _symbols(std::move(symbols)), _chunkSize(4096) {} + Params(Params &&) noexcept = default; + ~Params() = default; + size_t getChunkSize() const noexcept { return _chunkSize; } + std::unique_ptr<SymbolTable> detachSymbols() noexcept { return std::move(_symbols); } }; /** * Construct an initially empty Slime object. **/ - explicit Slime(Params params = Params()); + explicit Slime() : Slime(Params()) {} + explicit Slime(Params params); ~Slime(); @@ -88,13 +87,13 @@ public: static std::unique_ptr<SymbolTable> reclaimSymbols(Slime &&rhs); - size_t symbols() const noexcept; + size_t symbols() const noexcept { return _names->symbols(); } - Memory inspect(Symbol symbol) const; + Memory inspect(Symbol symbol) const { return _names->inspect(symbol); } - Symbol insert(Memory name); + Symbol insert(Memory name) { return _names->insert(name); } - Symbol lookup(Memory name) const; + Symbol lookup(Memory name) const { return _names->lookup(name); } Cursor &get() noexcept { return _root.get(); } diff --git a/vespalib/src/vespa/vespalib/data/slime/symbol.h b/vespalib/src/vespa/vespalib/data/slime/symbol.h index 3bce727fad9..a60a49fda27 100644 --- a/vespalib/src/vespa/vespalib/data/slime/symbol.h +++ b/vespalib/src/vespa/vespalib/data/slime/symbol.h @@ -19,8 +19,8 @@ private: public: Symbol() noexcept : _value(UNDEFINED) {} Symbol(uint32_t v) noexcept : _value(v) {} - bool undefined() const { return (_value == UNDEFINED); } - uint32_t getValue() const { return _value; } + bool undefined() const noexcept { return (_value == UNDEFINED); } + uint32_t getValue() const noexcept { return _value; } bool operator<(const Symbol &rhs) const noexcept { return (_value < rhs._value); } bool operator==(const Symbol &rhs) const noexcept { return (_value == rhs._value); } }; diff --git a/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp b/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp index a3313516c64..dffe35707fc 100644 --- a/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp +++ b/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp @@ -5,10 +5,13 @@ namespace vespalib::slime { -SymbolTable::SymbolTable(size_t expectedNumSymbols) : - _symbols(3*expectedNumSymbols), - _names(expectedNumSymbols, expectedNumSymbols*16) -{ } +SymbolTable::SymbolTable(size_t expectedNumSymbols) + : _stash(), + _symbols(3*expectedNumSymbols), + _names() +{ + _names.reserve(expectedNumSymbols); +} SymbolTable::~SymbolTable() = default; @@ -16,6 +19,7 @@ void SymbolTable::clear() { _names.clear(); _symbols.clear(); + _stash.clear(); } Symbol @@ -23,17 +27,21 @@ SymbolTable::insert(const Memory &name) { SymbolMap::const_iterator pos = _symbols.find(name); if (pos == _symbols.end()) { Symbol symbol(_names.size()); - SymbolVector::Reference r(_names.push_back(name.data, name.size)); - _symbols.insert(std::make_pair(Memory(r.c_str(), r.size()), symbol)); + char *buf = _stash.alloc(name.size); + memcpy(buf, name.data, name.size); + Memory backed(buf, name.size); + _names.push_back(backed); + _symbols.insert(std::make_pair(backed, symbol)); return symbol; } return pos->second; } + Symbol SymbolTable::lookup(const Memory &name) const { SymbolMap::const_iterator pos = _symbols.find(name); if (pos == _symbols.end()) { - return Symbol(); + return {}; } return pos->second; } diff --git a/vespalib/src/vespa/vespalib/data/slime/symbol_table.h b/vespalib/src/vespa/vespalib/data/slime/symbol_table.h index c5f3cf12fd6..0eae65cead0 100644 --- a/vespalib/src/vespa/vespalib/data/slime/symbol_table.h +++ b/vespalib/src/vespa/vespalib/data/slime/symbol_table.h @@ -4,8 +4,8 @@ #include "symbol.h" #include <vespa/vespalib/data/memory.h> +#include <vespa/vespalib/util/stash.h> #include <vespa/vespalib/stllike/hash_map.h> -#include <vespa/vespalib/data/memorydatastore.h> namespace vespalib::slime { @@ -21,21 +21,24 @@ private: } }; using SymbolMap = hash_map<Memory, Symbol, hasher>; - using SymbolVector = VariableSizeVector; + using SymbolVector = std::vector<Memory>; + Stash _stash; SymbolMap _symbols; SymbolVector _names; public: using UP = std::unique_ptr<SymbolTable>; - SymbolTable(size_t expectedNumSymbols=16); + SymbolTable() : SymbolTable(16) {} + explicit SymbolTable(size_t expectedNumSymbols); + SymbolTable(SymbolTable &&) noexcept = default; + SymbolTable & operator=(SymbolTable &&) noexcept = default; ~SymbolTable(); size_t symbols() const noexcept { return _names.size(); } Memory inspect(const Symbol &symbol) const { if (symbol.getValue() > _names.size()) { return Memory(); } - SymbolVector::Reference r(_names[symbol.getValue()]); - return Memory(r.c_str(), r.size()); + return _names[symbol.getValue()]; } Symbol insert(const Memory &name); Symbol lookup(const Memory &name) const; diff --git a/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h b/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h index dd41ed244ae..d3cdc174339 100644 --- a/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h +++ b/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h @@ -2,8 +2,9 @@ #pragma once -#include <iosfwd> +#include <cstddef> #include <cstdint> +#include <iosfwd> namespace vespalib { diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt index 5e8d29980cd..8ccef84d969 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(vespalib_vespalib_fuzzy OBJECT implicit_levenshtein_dfa.cpp levenshtein_dfa.cpp levenshtein_distance.cpp + table_dfa.cpp unicode_utils.cpp DEPENDS ) diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp index 826b0beffd6..d87afef1fbe 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp @@ -9,6 +9,7 @@ namespace { const vespalib::string brute_force = "brute_force"; const vespalib::string dfa_implicit = "dfa_implicit"; const vespalib::string dfa_explicit = "dfa_explicit"; +const vespalib::string dfa_table = "dfa_table"; } @@ -22,6 +23,8 @@ to_string(FuzzyMatchingAlgorithm algo) return dfa_implicit; case FuzzyMatchingAlgorithm::DfaExplicit: return dfa_explicit; + case FuzzyMatchingAlgorithm::DfaTable: + return dfa_table; default: return ""; } @@ -37,6 +40,8 @@ fuzzy_matching_algorithm_from_string(const vespalib::string& algo, return FuzzyMatchingAlgorithm::DfaImplicit; } else if (algo == dfa_explicit) { return FuzzyMatchingAlgorithm::DfaExplicit; + } else if (algo == dfa_table) { + return FuzzyMatchingAlgorithm::DfaTable; } return default_algo; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h index 83cb121fe5f..9af94e84b89 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h @@ -13,7 +13,8 @@ namespace vespalib { enum class FuzzyMatchingAlgorithm { BruteForce, DfaImplicit, - DfaExplicit + DfaExplicit, + DfaTable }; vespalib::string to_string(FuzzyMatchingAlgorithm algo); diff --git a/vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp new file mode 100644 index 00000000000..2475704ec06 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp @@ -0,0 +1,91 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// start of auto-generated code for N = 1 +template <> struct InlineTfa<1> { + static constexpr Transition table[6][8] = { + {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}}, + {{0,2},{0,2},{0,3},{0,3},{1,1},{1,1},{1,1},{1,1}}, + {{0,0},{0,0},{2,4},{2,4},{1,4},{1,4},{1,2},{1,2}}, + {{0,0},{3,4},{2,4},{2,2},{1,4},{1,5},{1,2},{1,3}}, + {{0,0},{0,0},{0,0},{0,0},{1,4},{1,4},{1,4},{1,4}}, + {{0,0},{3,4},{0,0},{3,4},{1,4},{1,5},{1,4},{1,5}} + }; + static constexpr uint8_t edits[6][3] = { + {2,2,2}, + {0,1,2}, + {1,1,2}, + {1,1,1}, + {1,2,2}, + {1,2,1} + }; +}; +// end of auto-generated code for N = 1 +// start of auto-generated code for N = 2 +template <> struct InlineTfa<2> { + static constexpr Transition table[31][32] = { + {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}}, + {{0,2},{0,2},{0,2},{0,2},{0,3},{0,3},{0,3},{0,3},{0,4},{0,4},{0,4},{0,4},{0,4},{0,4},{0,4},{0,4},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}}, + {{0,5},{0,5},{0,5},{0,5},{0,6},{0,6},{0,6},{0,6},{0,7},{0,7},{0,7},{0,7},{0,7},{0,7},{0,7},{0,7},{1,8},{1,8},{1,8},{1,8},{1,9},{1,9},{1,9},{1,9},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2}}, + {{0,5},{0,5},{0,10},{0,10},{0,6},{0,6},{0,11},{0,11},{0,7},{0,7},{0,12},{0,12},{0,7},{0,7},{0,12},{0,12},{1,8},{1,8},{1,13},{1,13},{1,9},{1,9},{1,14},{1,14},{1,2},{1,2},{1,3},{1,3},{1,2},{1,2},{1,3},{1,3}}, + {{0,6},{0,6},{0,11},{0,11},{0,15},{0,15},{0,15},{0,15},{0,7},{0,7},{0,12},{0,12},{0,16},{0,16},{0,16},{0,16},{1,9},{1,9},{1,14},{1,14},{1,17},{1,17},{1,17},{1,17},{1,2},{1,2},{1,3},{1,3},{1,4},{1,4},{1,4},{1,4}}, + {{0,0},{0,0},{0,0},{0,0},{3,18},{3,18},{3,18},{3,18},{2,18},{2,18},{2,18},{2,18},{2,19},{2,19},{2,19},{2,19},{1,18},{1,18},{1,18},{1,18},{1,20},{1,20},{1,20},{1,20},{1,19},{1,19},{1,19},{1,19},{1,5},{1,5},{1,5},{1,5}}, + {{0,0},{0,0},{4,18},{4,18},{3,18},{3,18},{3,19},{3,19},{2,18},{2,18},{2,20},{2,20},{2,19},{2,19},{2,5},{2,5},{1,18},{1,18},{1,21},{1,21},{1,20},{1,20},{1,22},{1,22},{1,19},{1,19},{1,23},{1,23},{1,5},{1,5},{1,6},{1,6}}, + {{2,19},{2,19},{2,5},{2,5},{3,8},{3,8},{3,8},{3,8},{2,19},{2,19},{2,5},{2,5},{3,8},{3,8},{3,8},{3,8},{1,5},{1,5},{1,6},{1,6},{1,7},{1,7},{1,7},{1,7},{1,5},{1,5},{1,6},{1,6},{1,7},{1,7},{1,7},{1,7}}, + {{0,19},{0,19},{0,19},{0,19},{0,19},{0,19},{0,19},{0,19},{0,5},{0,5},{0,5},{0,5},{0,5},{0,5},{0,5},{0,5},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8}}, + {{0,19},{0,19},{0,19},{0,19},{0,23},{0,23},{0,23},{0,23},{0,5},{0,5},{0,5},{0,5},{0,6},{0,6},{0,6},{0,6},{1,8},{1,8},{1,8},{1,8},{1,9},{1,9},{1,9},{1,9},{1,8},{1,8},{1,8},{1,8},{1,9},{1,9},{1,9},{1,9}}, + {{0,0},{5,18},{0,0},{5,18},{3,18},{3,20},{3,18},{3,20},{2,18},{2,21},{2,18},{2,21},{2,19},{2,23},{2,19},{2,23},{1,18},{1,24},{1,18},{1,24},{1,20},{1,25},{1,20},{1,25},{1,19},{1,26},{1,19},{1,26},{1,5},{1,10},{1,5},{1,10}}, + {{0,0},{5,18},{4,18},{4,19},{3,18},{3,20},{3,19},{3,5},{2,18},{2,21},{2,20},{2,22},{2,19},{2,23},{2,5},{2,6},{1,18},{1,24},{1,21},{1,27},{1,20},{1,25},{1,22},{1,28},{1,19},{1,26},{1,23},{1,29},{1,5},{1,10},{1,6},{1,11}}, + {{2,19},{2,23},{2,5},{2,6},{3,8},{3,9},{3,8},{3,9},{2,19},{2,23},{2,5},{2,6},{3,8},{3,9},{3,8},{3,9},{1,5},{1,10},{1,6},{1,11},{1,7},{1,12},{1,7},{1,12},{1,5},{1,10},{1,6},{1,11},{1,7},{1,12},{1,7},{1,12}}, + {{0,19},{0,19},{0,26},{0,26},{0,19},{0,19},{0,26},{0,26},{0,5},{0,5},{0,10},{0,10},{0,5},{0,5},{0,10},{0,10},{1,8},{1,8},{1,13},{1,13},{1,8},{1,8},{1,13},{1,13},{1,8},{1,8},{1,13},{1,13},{1,8},{1,8},{1,13},{1,13}}, + {{0,19},{0,19},{0,26},{0,26},{0,23},{0,23},{0,29},{0,29},{0,5},{0,5},{0,10},{0,10},{0,6},{0,6},{0,11},{0,11},{1,8},{1,8},{1,13},{1,13},{1,9},{1,9},{1,14},{1,14},{1,8},{1,8},{1,13},{1,13},{1,9},{1,9},{1,14},{1,14}}, + {{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{2,5},{2,6},{2,7},{2,7},{2,5},{2,6},{2,7},{2,7},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30},{1,6},{1,11},{1,15},{1,15},{1,6},{1,11},{1,15},{1,15}}, + {{2,5},{2,6},{2,7},{2,7},{3,8},{3,9},{3,2},{3,2},{2,5},{2,6},{2,7},{2,7},{3,8},{3,9},{3,2},{3,2},{1,6},{1,11},{1,15},{1,15},{1,7},{1,12},{1,16},{1,16},{1,6},{1,11},{1,15},{1,15},{1,7},{1,12},{1,16},{1,16}}, + {{0,6},{0,6},{0,11},{0,11},{0,15},{0,15},{0,15},{0,15},{0,6},{0,6},{0,11},{0,11},{0,15},{0,15},{0,15},{0,15},{1,9},{1,9},{1,14},{1,14},{1,17},{1,17},{1,17},{1,17},{1,9},{1,9},{1,14},{1,14},{1,17},{1,17},{1,17},{1,17}}, + {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18}}, + {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,18},{2,18},{2,18},{2,18},{2,18},{2,18},{2,18},{2,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,19},{1,19},{1,19},{1,19},{1,19},{1,19},{1,19},{1,19}}, + {{0,0},{0,0},{0,0},{0,0},{3,18},{3,18},{3,18},{3,18},{0,0},{0,0},{0,0},{0,0},{3,18},{3,18},{3,18},{3,18},{1,18},{1,18},{1,18},{1,18},{1,20},{1,20},{1,20},{1,20},{1,18},{1,18},{1,18},{1,18},{1,20},{1,20},{1,20},{1,20}}, + {{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21}}, + {{0,0},{0,0},{4,18},{4,18},{3,18},{3,18},{3,19},{3,19},{0,0},{0,0},{4,18},{4,18},{3,18},{3,18},{3,19},{3,19},{1,18},{1,18},{1,21},{1,21},{1,20},{1,20},{1,22},{1,22},{1,18},{1,18},{1,21},{1,21},{1,20},{1,20},{1,22},{1,22}}, + {{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{2,18},{2,18},{2,20},{2,20},{2,18},{2,18},{2,20},{2,20},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21},{1,19},{1,19},{1,23},{1,23},{1,19},{1,19},{1,23},{1,23}}, + {{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24}}, + {{0,0},{5,18},{0,0},{5,18},{3,18},{3,20},{3,18},{3,20},{0,0},{5,18},{0,0},{5,18},{3,18},{3,20},{3,18},{3,20},{1,18},{1,24},{1,18},{1,24},{1,20},{1,25},{1,20},{1,25},{1,18},{1,24},{1,18},{1,24},{1,20},{1,25},{1,20},{1,25}}, + {{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{2,18},{2,21},{2,18},{2,21},{2,18},{2,21},{2,18},{2,21},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,19},{1,26},{1,19},{1,26},{1,19},{1,26},{1,19},{1,26}}, + {{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27}}, + {{0,0},{5,18},{4,18},{4,19},{3,18},{3,20},{3,19},{3,5},{0,0},{5,18},{4,18},{4,19},{3,18},{3,20},{3,19},{3,5},{1,18},{1,24},{1,21},{1,27},{1,20},{1,25},{1,22},{1,28},{1,18},{1,24},{1,21},{1,27},{1,20},{1,25},{1,22},{1,28}}, + {{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{2,18},{2,21},{2,20},{2,22},{2,18},{2,21},{2,20},{2,22},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27},{1,19},{1,26},{1,23},{1,29},{1,19},{1,26},{1,23},{1,29}}, + {{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30}} + }; + static constexpr uint8_t edits[31][5] = { + {3,3,3,3,3}, + {0,1,2,3,3}, + {1,1,2,3,3}, + {1,1,2,2,3}, + {1,1,1,2,3}, + {2,2,2,3,3}, + {2,2,2,2,3}, + {2,2,1,2,3}, + {1,2,3,3,3}, + {1,2,2,3,3}, + {2,2,2,3,2}, + {2,2,2,2,2}, + {2,2,1,2,2}, + {1,2,3,2,3}, + {1,2,2,2,3}, + {2,2,2,1,2}, + {2,2,1,1,2}, + {1,2,1,2,3}, + {2,3,3,3,3}, + {2,2,3,3,3}, + {2,3,2,3,3}, + {2,3,3,2,3}, + {2,3,2,2,3}, + {2,2,3,2,3}, + {2,3,3,3,2}, + {2,3,2,3,2}, + {2,2,3,3,2}, + {2,3,3,2,2}, + {2,3,2,2,2}, + {2,2,3,2,2}, + {2,3,2,1,2} + }; +}; +// end of auto-generated code for N = 2 diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index 1caae408176..5f6d0ae9956 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "explicit_levenshtein_dfa.h" #include "implicit_levenshtein_dfa.h" +#include "table_dfa.h" #include "levenshtein_dfa.h" #include "unicode_utils.h" #include <vespa/vespalib/util/stringfmt.h> @@ -53,14 +54,19 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max } else { // max_edits == 2 return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased)); } - } else { // DfaType::Explicit + } else if(dfa_type == DfaType::Explicit) { if (max_edits == 1) { return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa(); } else { // max_edits == 2 return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa(); } + } else { // DfaType::Table + if (max_edits == 1) { + return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased)); + } else { // max_edits == 2 + return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased)); + } } - } LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) { @@ -87,9 +93,11 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) { if (dt == LevenshteinDfa::DfaType::Implicit) { os << "Implicit"; - } else { - assert(dt == LevenshteinDfa::DfaType::Explicit); + } else if (dt == LevenshteinDfa::DfaType::Explicit) { os << "Explicit"; + } else { + assert(dt == LevenshteinDfa::DfaType::Table); + os << "Table"; } return os; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index c6ca06d4de3..1652631e968 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -58,8 +58,8 @@ namespace vespalib::fuzzy { * ====== Unicode support ====== * * Matching and successor generation is fully Unicode-aware. All input strings are expected - * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats; - * see the documentation for match()). + * to be in UTF-8, and the generated successor is encoded as UTF-8 (with some caveats; see + * the documentation for match()) or UTF-32, depending on the chosen `match()` overload. * * Internally, matching is done on UTF-32 code points and the DFA itself is built around * UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA. @@ -159,7 +159,7 @@ public: /** * Attempts to match the source string `source` with the target string this DFA was - * built with, emitting a successor string on mismatch if `successor_out` != nullptr. + * built with. * * `source` must not contain any null UTF-8 chars. * @@ -179,13 +179,23 @@ public: * Attempts to match the source string `source` with the target string this DFA was * built with, emitting a successor string into `successor_out` on mismatch. * - * See `match(source)` for semantics of returned MatchResult. + * In the case of a _match_, the following holds: + * + * - The returned MatchResult has the same semantics as `match(source)`. + * - `successor_out` has a _prefix_ equal to its value that was originally passed + * in at the time of match() being called. The _suffix_ of the string is unspecified, + * i.e. it may or may not have been modified. * * In the case of a _mismatch_, the following holds: * - * - `successor_out` is modified to contain the next (in byte-wise ordering) possible - * _matching_ string S so that there exists no other matching string S' that is - * greater than `source` but smaller than S. + * - `successor_out` has a _prefix_ equal to its value that was originally passed + * in at the time of match() being called. + * - `successor_out` has a _suffix_ that contains the next (in byte-wise ordering) + * possible _matching_ string S so that there exists no other matching string S' + * that is greater than `source` but smaller than S. + * The caller must explicitly be aware of any prefixes it sends in, as it is + * entirely ignored for the purposes of ordering the successor string vis-a-vis + * the input source string. * - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally * encode in bitwise form, but the _code points_ they encode may not be valid. * In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of @@ -203,12 +213,8 @@ public: * is what is passed to the DFA match() function. * * Memory allocation: - * This function does not directly or indirectly allocate any heap memory if either: - * - * - the input string is within the max edit distance, or - * - `successor_out` is nullptr, or - * - `successor_out` has sufficient capacity to hold the generated successor - * + * This function does not directly or indirectly allocate any heap memory if the + * `successor_out` string provided is large enough to fit any generated successor. * By reusing the successor string across many calls, this therefore amortizes memory * allocations down to near zero per invocation. */ @@ -220,7 +226,8 @@ public: * internally, and is therefore expected to be more efficient. * * The code point ordering of the UTF-32 successor string is identical to that its UTF-8 - * equivalent. + * equivalent. This includes the special cases where the successor may contain code points + * outside the legal Unicode range. */ [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const; @@ -231,7 +238,8 @@ public: enum class DfaType { Implicit, - Explicit + Explicit, + Table }; /** diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index fb5ec32abc7..9f5dc0fb0c0 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -136,15 +136,14 @@ struct MatchAlgorithm { * that has nothing in common with the source altogether. * Example: "gp" -> "hfood" (+1 char value case) * - * Performance note: - * Both the input and successor output strings are in UTF-8 format. To avoid doing - * duplicate work, we keep track of the byte length of the string prefix that will be - * part of the successor and simply copy it verbatim instead of building the string - * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used - * when one or more of the prefix characters have been lowercase-transformed. + * Note for cased vs. uncased matching: when uncased matching is specified, we always + * match "as if" both the target and source strings are lowercased. This means that + * successor strings are generated based on this form, _not_ on the original form. + * Example: uncased matching for target "food" with input "FOXX". This generates the + * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different + * ordering when compared byte-wise against an implicitly lowercased dictionary. * * TODO let matcher know if source string is pre-normalized (i.e. lowercased). - * TODO consider opportunistically appending prefix as we go instead of only when needed. */ template <DfaMatcher Matcher, typename SuccessorT> static MatchResult match(const Matcher& matcher, @@ -153,22 +152,19 @@ struct MatchAlgorithm { { using StateType = typename Matcher::StateType; Utf8Reader u8_reader(source.data(), source.size()); - uint32_t n_prefix_u8_bytes = 0; + uint32_t n_prefix_chars = static_cast<uint32_t>(successor_out.size()); // Don't touch any existing prefix uint32_t char_after_prefix = 0; StateType last_state_with_higher_out = StateType{}; - bool can_use_raw_prefix = true; StateType state = matcher.start(); while (u8_reader.hasMore()) { - const auto u8_pos_before_char = u8_reader.getPos(); - const uint32_t raw_mch = u8_reader.getChar(); - const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); - if (raw_mch != mch) { - can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix - } + const auto pos_before_char = static_cast<uint32_t>(successor_out.size()); + const uint32_t raw_mch = u8_reader.getChar(); + const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); + append_utf32_char(successor_out, mch); if (matcher.has_higher_out_edge(state, mch)) { last_state_with_higher_out = state; - n_prefix_u8_bytes = u8_pos_before_char; + n_prefix_chars = pos_before_char; char_after_prefix = mch; } auto maybe_next = matcher.match_input(state, mch); @@ -176,8 +172,7 @@ struct MatchAlgorithm { state = maybe_next; } else { // Can never match; find the successor - emit_successor_prefix(successor_out, source, n_prefix_u8_bytes, - matcher.is_cased() || can_use_raw_prefix); + successor_out.resize(n_prefix_chars); // Always <= successor_out.size() assert(matcher.valid_state(last_state_with_higher_out)); backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out, char_after_prefix, successor_out); @@ -188,8 +183,7 @@ struct MatchAlgorithm { if (edits <= max_edits()) { return MatchResult::make_match(max_edits(), edits); } - emit_successor_prefix(successor_out, source, source.size(), - matcher.is_cased() || can_use_raw_prefix); + // Successor prefix already filled, just need to emit the suffix emit_smallest_matching_suffix(matcher, state, successor_out); return MatchResult::make_mismatch(max_edits()); } @@ -320,48 +314,6 @@ struct MatchAlgorithm { } } - template <typename T> - static constexpr bool has_8bit_value_type() noexcept { - return sizeof(typename T::value_type) == 1; - } - - /** - * The successor prefix is the prefix of the source string up to (but not including) the - * point where we emit a lexicographically higher character. Ideally we can just copy the - * UTF-8 bytes verbatim from the source into the successor. This is possible when one of - * the following holds: - * - * - DFA uses Cased (i.e. exact) matching, or - * - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase - * transform. This means the prefix is already as-if lowercased, and we can copy it - * verbatim. - * - * In the case that we can't copy verbatim, we currently have to explicitly normalize the - * prefix by converting it to its lowercased form. - * - * Example: Uncased matching for target "food" with input "FOXX". This generates the - * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different - * ordering when compared byte-wise against an implicitly lowercased dictionary. - */ - template <typename SuccessorT> - static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source, - uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes) - { - // TODO redesign prefix output wiring - if constexpr (has_8bit_value_type<SuccessorT>()) { - if (emit_raw_prefix_u8_bytes) { - successor_out = source.substr(0, n_prefix_u8_bytes); - return; - } - } - // TODO avoid duplicate work...! :I - successor_out.clear(); - Utf8Reader u8_reader(source.data(), source.size()); - while (u8_reader.getPos() < n_prefix_u8_bytes) { - append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar())); - } - } - static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept { return (is_cased ? in_ch : LowerCase::convert(in_ch)); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h index d20cfc07a9a..dfec0bac4a8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h +++ b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h @@ -112,11 +112,11 @@ std::ostream& operator<<(std::ostream& os, const FixedSparseState<MaxEdits>& s) if (i != 0) { os << ", "; } - for (size_t j = last_idx; j < s.indices[i]; ++j) { + for (size_t j = last_idx; j < s.index(i); ++j) { os << "-, "; } - last_idx = s.indices[i] + 1; - os << static_cast<uint32_t>(s.costs[i]); + last_idx = s.index(i) + 1; + os << static_cast<uint32_t>(s.cost(i)); } os << "]"; return os; diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp new file mode 100644 index 00000000000..943349818fb --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp @@ -0,0 +1,10 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "table_dfa.hpp" + +namespace vespalib::fuzzy { + +template class TableDfa<1>; +template class TableDfa<2>; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h new file mode 100644 index 00000000000..45c52c84bad --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h @@ -0,0 +1,63 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "levenshtein_dfa.h" +#include <vector> + +namespace vespalib::fuzzy { + +/** + * This implementation is based on the paper 'Fast string correction + * with Levenshtein automata' from 2002 by Klaus U. Schulz and Stoyan + * Mihov. + * + * Given the maximal distance N, a generic parameterized transition + * table is calculated up-front. When a specific word is given, a + * simple lookup structure is created to enumerate the possible + * characteristic vectors for each position in the given + * word. Together, these structures can be used to simulate the + * traversal of a hypothetical Levenshtein dfa that will never be + * created. + * + * Approaching the end of the word is handled by padding the + * characteristic vectors with 0 bits for everything after the word + * ends. In addition, a unit test verifies that there is no possible + * sequence of events that leads to the minimal boundary of the state + * exceeding the boundary of the word itself. This means that the + * simulated dfa can be stepped freely without checking for word size. + **/ +template <uint8_t N> +class TableDfa final : public LevenshteinDfa::Impl +{ +public: + // characteristic vector for a specific input value indicating how + // it matches the window starting at the minimal boundary. + struct CV { + uint32_t input; + uint32_t match; + CV() noexcept : input(0), match(0) {} + }; + static constexpr size_t window_size() { return 2 * N + 1; } + struct Lookup { + std::array<CV, window_size()> list; + Lookup() noexcept : list() {} + }; + +private: + const std::vector<Lookup> _lookup; + const bool _is_cased; + + static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str); + +public: + using MatchResult = LevenshteinDfa::MatchResult; + TableDfa(std::vector<uint32_t> str, bool is_cased); + ~TableDfa() override; + [[nodiscard]] MatchResult match(std::string_view source) const override; + [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const override; + [[nodiscard]] size_t memory_usage() const noexcept override; + void dump_as_graphviz(std::ostream& os) const override; +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp new file mode 100644 index 00000000000..de850681113 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp @@ -0,0 +1,586 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "table_dfa.h" +#include "match_algorithm.hpp" +#include <vespa/vespalib/util/stringfmt.h> +#include <cassert> +#include <stdexcept> +#include <algorithm> +#include <map> +#include <ostream> +#include <set> +#include <queue> + +namespace vespalib::fuzzy { + +namespace { + +using vespalib::make_string_short::fmt; + +// It is useful to know the number of states compile time to be able +// to pack lookup tables better. +template <uint8_t N> constexpr size_t num_states(); +template <> constexpr size_t num_states<1>() { return 6; } +template <> constexpr size_t num_states<2>() { return 31; } +template <> constexpr size_t num_states<3>() { return 197; } + +template <uint8_t N> constexpr size_t window_size() { return 2 * N + 1; } +template <uint8_t N> constexpr size_t num_transitions() { return 1 << window_size<N>(); } + + +auto diff(auto a, auto b) { return (a > b) ? (a - b) : (b - a); } + +// A Position combines an index into a word being matched with the +// number of edits needed to get there. This maps directly onto a +// specific state in the NFA used to match a word. Note that the sort +// order prefers low edits over low indexs. This is to ensure that a +// position that subsumes another position will always sort before it. +struct Position { + uint32_t index; + uint32_t edits; + Position(uint32_t index_in, uint32_t edits_in) noexcept + : index(index_in), edits(edits_in) {} + static Position start() noexcept { return Position(0,0); } + bool subsumes(const Position &rhs) const noexcept { + if (edits >= rhs.edits) { + return false; + } + return diff(index, rhs.index) <= (rhs.edits - edits); + } + Position materialize(uint32_t target_index) const noexcept { + return Position(target_index, edits + diff(index, target_index)); + } + bool operator==(const Position &rhs) const noexcept { + return (index == rhs.index) && (edits == rhs.edits); + } + bool operator<(const Position &rhs) const noexcept { + return std::tie(edits,index) < std::tie(rhs.edits, rhs.index); + } + template <uint8_t N> + void add_elementary_transitions(const std::vector<bool> &bits, std::vector<Position> &dst) const { + assert(bits.size() > index); + if (!bits[index]) { + dst.emplace_back(index, edits + 1); + dst.emplace_back(index + 1, edits + 1); + } + for (uint32_t e = 0; (edits + e) <= N; ++e) { + assert(bits.size() > (index + e)); + if (bits[index + e]) { + dst.emplace_back(index + e + 1, edits + e); + } + } + } + vespalib::string to_string() const { return fmt("%u#%u", index, edits); } +}; + +// A State is a collection of different Positions that do not subsume +// each other. If the minimal boundary of a state is larger than 0, it +// can be lifted from the state in a normalizing operation that will +// renumber the position indexes such that the minimal boundary of the +// state becomes 0. This is to allow parameterized states where the +// general progress of matching the string (minimal boundary of +// non-normalized state) is untangled from the local competing edit +// alternatives (normalized state). +struct State { + std::vector<Position> list; + State() noexcept : list() {} + static State failed() noexcept { return State(); } + static State start() { + State result; + result.list.push_back(Position::start()); + return result; + } + bool operator<(const State &rhs) const { + return list < rhs.list; + } + uint32_t minimal_boundary() const noexcept { + if (list.empty()) { + return 0; + } + uint32_t min = list[0].index; + for (size_t i = 1; i < list.size(); ++i) { + min = std::min(min, list[i].index); + } + return min; + } + uint32_t normalize() { + uint32_t min = minimal_boundary(); + if (min > 0) { + for (auto &entry: list) { + entry.index -= min; + } + } + return min; + } + template <uint8_t N> + static State create(std::vector<Position> list_in) { + State result; + auto want = [&result](Position pos) { + if (pos.edits > N) { + return false; + } + for (const auto &old_pos: result.list) { + if (old_pos == pos || old_pos.subsumes(pos)) { + return false; + } + } + return true; + }; + std::sort(list_in.begin(), list_in.end()); + for (const auto &pos: list_in) { + if (want(pos)) { + result.list.push_back(pos); + } + } + return result; + } + template <uint8_t N> + State next(const std::vector<bool> &bits) const { + std::vector<Position> tmp; + for (const auto &pos: list) { + pos.add_elementary_transitions<N>(bits, tmp); + } + return create<N>(std::move(tmp)); + } + template <uint8_t N> + std::vector<uint8_t> make_edit_vector() const { + std::vector<uint8_t> result(window_size<N>(), N + 1); + for (const auto &pos: list) { + for (uint32_t i = 0; i < window_size<N>(); ++i) { + result[i] = std::min(result[i], uint8_t(pos.materialize(i).edits)); + } + } + return result; + } + vespalib::string to_string() const { + vespalib::string result = "{"; + for (size_t i = 0; i < list.size(); ++i) { + if (i > 0) { + result.append(","); + } + result.append(list[i].to_string()); + } + result.append("}"); + return result; + } +}; + +// Keeps track of unique states, assigning an integer value to each +// state. Only states with minimal boundary 0 is allowed to be +// inserted into a state repo. Each repo is seeded with the empty +// state (0) and the start state (1). An assigned integer value can be +// mapped back into the originating state. +struct StateRepo { + using Map = std::map<State,uint32_t>; + using Ref = Map::iterator; + Map seen; + std::vector<Ref> refs; + StateRepo() noexcept : seen(), refs() { + auto failed_idx = state_to_idx(State::failed()); + auto start_idx = state_to_idx(State::start()); + assert(failed_idx == 0); + assert(start_idx == 1); + } + ~StateRepo(); + size_t size() const { return seen.size(); } + uint32_t state_to_idx(const State &state) { + assert(state.minimal_boundary() == 0); + uint32_t next = refs.size(); + auto [pos, inserted] = seen.emplace(state, next); + if (inserted) { + refs.push_back(pos); + } + assert(seen.size() == refs.size()); + return pos->second; + } + const State &idx_to_state(uint32_t idx) const { + assert(idx < refs.size()); + return refs[idx]->first; + } +}; +[[maybe_unused]] StateRepo::~StateRepo() = default; + +template <uint8_t N> +std::vector<bool> expand_bits(uint32_t value) { + static_assert(N < 10); + std::vector<bool> result(window_size<N>()); + uint32_t look_for = num_transitions<N>(); + assert(value < look_for); + for (size_t i = 0; i < result.size(); ++i) { + look_for >>= 1; + result[i] = (value & look_for); + } + return result; +} + +template <uint8_t N> +StateRepo make_state_repo() { + StateRepo repo; + for (uint32_t idx = 0; idx < repo.size(); ++idx) { + const State &state = repo.idx_to_state(idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + State new_state = state.next<N>(expand_bits<N>(i)); + (void) new_state.normalize(); + (void) repo.state_to_idx(new_state); + } + } + return repo; +} + +struct Transition { + uint8_t step; + uint8_t state; + constexpr Transition() noexcept : step(0), state(0) {} + constexpr Transition(uint8_t di, uint8_t ns) noexcept : step(di), state(ns) {} +}; + +template <uint8_t N> struct InlineTfa; +#include "inline_tfa.hpp" + +template <uint8_t N> +struct Tfa { + // what happens when following a transition from a state? + std::array<std::array<Transition,num_transitions<N>()>,num_states<N>()> table; + + // how many edits did we use to match the target word? + std::array<std::array<uint8_t,window_size<N>()>,num_states<N>()> edits; +}; + +template <uint8_t N> +std::unique_ptr<Tfa<N>> make_tfa() { + auto tfa = std::make_unique<Tfa<N>>(); + StateRepo repo; + uint32_t state_idx = 0; + for (; state_idx < repo.size(); ++state_idx) { + const State &state = repo.idx_to_state(state_idx); + for (uint32_t i = 0; i < num_transitions<N>(); ++i) { + State new_state = state.next<N>(expand_bits<N>(i)); + uint32_t step = new_state.normalize(); + uint32_t new_state_idx = repo.state_to_idx(new_state); + assert(step < 256); + assert(new_state_idx < 256); + tfa->table[state_idx][i].step = step; + tfa->table[state_idx][i].state = new_state_idx; + } + auto edits = state.make_edit_vector<N>(); + assert(edits.size() == window_size<N>()); + for (uint32_t i = 0; i < window_size<N>(); ++i) { + tfa->edits[state_idx][i] = edits[i]; + } + } + assert(repo.size() == num_states<N>()); + assert(state_idx == num_states<N>()); + return tfa; +} + +template <typename T> +vespalib::string format_vector(const std::vector<T> &vector, bool compact = false) { + vespalib::string str = compact ? "" : "["; + for (size_t i = 0; i < vector.size(); ++i) { + if (i > 0 && !compact) { + str.append(","); + } + str.append(fmt("%u", uint32_t(vector[i]))); + } + if (!compact) { + str.append("]"); + } + return str; +} + +// A table-based state using the InlineTfa tables to simulate stepping +// a dfa with max edit distance N. The state itself is represented by +// a number used as offset into these tables (state). Since the state +// is parametric, we also store the minimal boundary of the state +// separately (index). +template <uint8_t N> +struct TfaState { + uint32_t index; + uint32_t state; + // needed by dfa matcher concept (should use std::declval instead) + constexpr TfaState() noexcept : index(0), state(0) {} + constexpr TfaState(uint32_t i, uint32_t s) noexcept : index(i), state(s) {} + static constexpr TfaState start() { return TfaState(0, 1); } + constexpr bool valid() const noexcept { return state != 0; } + constexpr TfaState next(uint32_t bits) const noexcept { + const auto &entry = InlineTfa<N>::table[state][bits]; + return TfaState(index + entry.step, entry.state); + } + constexpr bool is_valid_edge(uint32_t bits) const noexcept { + return InlineTfa<N>::table[state][bits].state != 0; + } + constexpr uint8_t edits(uint32_t end) const noexcept { + uint32_t leap = (end - index); + return (leap < window_size<N>()) ? InlineTfa<N>::edits[state][leap] : N + 1; + } + // for pretty graphviz dumping; minimal possible edits given perfect input from here on + constexpr uint32_t min_edits() const noexcept { + uint8_t res = N + 1; + for (uint32_t i = 0; i < window_size<N>(); ++i) { + res = std::min(res, InlineTfa<N>::edits[state][i]); + } + return res; + } + // for pretty graphviz dumping; actual edits needed to reach the word end from a valid state + constexpr uint32_t exact_edits(uint32_t end) const noexcept { + assert(valid()); + uint32_t res = end; + for (uint32_t i = 0; i < window_size<N>(); ++i) { + if (uint32_t e = InlineTfa<N>::edits[state][i]; e <= N) { + res = std::min(res, e + diff(index + i, end)); + } + } + return res; + } + // for pretty graphviz dumping; enable using in map and set + bool operator<(const TfaState &rhs) const noexcept { + return std::tie(index, state) < std::tie(rhs.index, rhs.state); + } + // for pretty graphviz dumping; check for redundant edges + bool operator==(const TfaState &rhs) const noexcept { + return std::tie(index, state) == std::tie(rhs.index, rhs.state); + } +}; + +template <uint8_t N> +struct TableMatcher { + using S = TfaState<N>; + using StateType = TfaState<N>; + using StateParamType = TfaState<N>; + using EdgeType = uint32_t; + + const TableDfa<N>::Lookup *lookup; + const uint32_t end; + const bool cased; + + TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in) + noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {} + + bool is_cased() const noexcept { return cased; } + static constexpr S start() noexcept { return S::start(); } + + uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); } + bool is_match(S s) const noexcept { return s.edits(end) <= N; } + + static constexpr bool can_match(S s) noexcept { return s.valid(); } + static constexpr bool valid_state(S) noexcept { return true; } + + S match_wildcard(S s) const noexcept { return s.next(0); } + S match_input(S s, uint32_t c) const noexcept { + const auto *slice = lookup[s.index].list.data(); + for (size_t i = 0; i < window_size<N>() && slice[i].input != 0; ++i) { + if (slice[i].input == c) { + return s.next(slice[i].match); + } + } + return match_wildcard(s); + } + + bool has_higher_out_edge(S s, uint32_t c) const noexcept { + if (s.is_valid_edge(0)) { + return true; + } + const auto *slice = lookup[s.index].list.data(); + for (size_t i = 0; i < window_size<N>() && slice[i].input > c; ++i) { + if (s.is_valid_edge(slice[i].match)) { + return true; + } + } + return false; + } + + bool has_exact_explicit_out_edge(S s, uint32_t c) const noexcept { + const auto *slice = lookup[s.index].list.data(); + for (size_t i = 0; i < window_size<N>() && slice[i].input >= c; ++i) { + if (slice[i].input == c) { + return s.is_valid_edge(slice[i].match); + } + } + return false; + } + + uint32_t lowest_higher_explicit_out_edge(S s, uint32_t c) const noexcept { + const auto *slice = lookup[s.index].list.data(); + size_t i = window_size<N>(); + while (i-- > 0) { + if (slice[i].input > c && s.is_valid_edge(slice[i].match)) { + return slice[i].input; + } + } + return 0; + } + + uint32_t smallest_explicit_out_edge(S s) const noexcept { + const auto *slice = lookup[s.index].list.data(); + size_t i = window_size<N>(); + while (i-- > 0) { + if (slice[i].input != 0 && s.is_valid_edge(slice[i].match)) { + return slice[i].input; + } + } + return 0; + } + + static constexpr bool valid_edge(uint32_t) noexcept { return true; } + static constexpr uint32_t edge_to_u32char(uint32_t c) noexcept { return c; } + S edge_to_state(S s, uint32_t c) const noexcept { return match_input(s, c); } + + static constexpr bool implies_exact_match_suffix(S) noexcept { return false; } + static constexpr void emit_exact_match_suffix(S, std::vector<uint32_t> &) {} // not called + static constexpr void emit_exact_match_suffix(S, std::string &) {} // not called +}; + +} // unnamed + +template <uint8_t N> +auto +TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup> +{ + std::vector<Lookup> result(str.size() + 1); + auto have_already = [&](uint32_t c, size_t i)noexcept{ + for (size_t j = 0; j < window_size(); ++j) { + if (result[i].list[j].input == c) { + return true; + } + } + return false; + }; + auto make_vector = [&](uint32_t c, size_t i)noexcept{ + uint32_t bits = 0; + for (size_t j = 0; j < window_size(); ++j) { + bool found = ((i + j) < str.size()) && (str[i+j] == c); + bits = (bits << 1) + found; + } + return bits; + }; + for (size_t i = 0; i < str.size(); ++i) { + for (size_t j = 0; j < window_size(); ++j) { + assert(result[i].list[j].input == 0); + assert(result[i].list[j].match == 0); + if ((i + j) < str.size()) { + uint32_t c = str[i + j]; + if (!have_already(c, i)) { + result[i].list[j].input = c; + result[i].list[j].match = make_vector(c, i); + } + } + } + std::sort(result[i].list.begin(), result[i].list.end(), + [](const auto &a, const auto &b){ return a.input > b.input; }); + } + return result; +} + +template <uint8_t N> +TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased) + : _lookup(make_lookup(str)), + _is_cased(is_cased) +{ +} + +template <uint8_t N> +TableDfa<N>::~TableDfa() = default; + +template <uint8_t N> +LevenshteinDfa::MatchResult +TableDfa<N>::match(std::string_view u8str) const +{ + TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + return MatchAlgorithm<N>::match(matcher, u8str); +} + +template <uint8_t N> +LevenshteinDfa::MatchResult +TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const +{ + TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + return MatchAlgorithm<N>::match(matcher, u8str, successor_out); +} + +template <uint8_t N> +LevenshteinDfa::MatchResult +TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const +{ + TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + return MatchAlgorithm<N>::match(matcher, u8str, successor_out); +} + +template <uint8_t N> +size_t +TableDfa<N>::memory_usage() const noexcept +{ + return _lookup.size() * sizeof(Lookup); +} + +template <uint8_t N> +void +TableDfa<N>::dump_as_graphviz(std::ostream &os) const +{ + using state_t = TfaState<N>; + auto id_of = [state_ids = std::map<state_t,uint32_t>()](state_t state) mutable { + return state_ids.emplace(state, state_ids.size()).first->second; + }; + auto explore = [explored = std::set<state_t>()](state_t state) mutable { + return explored.insert(state).second; + }; + auto edits = [end = (_lookup.size() - 1)](state_t state) noexcept -> uint32_t { + return state.exact_edits(end); + }; + struct Edge { + uint32_t input; + state_t from; + state_t to; + operator bool() const { return to.valid(); } + }; + auto best_edge = [&](const Edge &a, const Edge &b) { + // inverted due to priority queue + if (a.to.min_edits() == b.to.min_edits()) { + return edits(a.to) > edits(b.to); + } + return a.to.min_edits() > b.to.min_edits(); + }; + auto todo = std::priority_queue<Edge,std::vector<Edge>,decltype(best_edge)>(best_edge); + auto handle_state = [&](state_t state) { + if (explore(state)) { + // number states by following the best edges first + auto my_id = id_of(state); + if (edits(state) <= N) { + os << " " << my_id << " [label=\"" << my_id << "(" << edits(state) << ")\", style=\"filled\"];\n"; + } + auto null_edge = Edge{0, state, state.next(0)}; + if (null_edge) { + todo.push(null_edge); + } + for (auto [c, bits]: _lookup[state.index].list) { + if (auto edge = Edge{c, state, state.next(bits)}; edge && edge.to != null_edge.to) { + // only process valid out edges that are not covered by the null_edge + todo.push(edge); + } + } + } + }; + auto handle_edge = [&](Edge edge) { + handle_state(edge.to); + if (edge.input != 0) { + std::string as_utf8; + append_utf32_char(as_utf8, edge.input); + os << " " << id_of(edge.from) << " -> " << id_of(edge.to) << " [label=\"" << as_utf8 << "\"];\n"; + } else { + os << " " << id_of(edge.from) << " -> " << id_of(edge.to) << " [label=\"*\"];\n"; + } + }; + os << std::dec << "digraph table_dfa {\n"; + os << " fontname=\"Helvetica,Arial,sans-serif\"\n"; + os << " node [shape=circle, fontname=\"Helvetica,Arial,sans-serif\", fixedsize=true];\n"; + os << " edge [fontname=\"Helvetica,Arial,sans-serif\"];\n"; + handle_state(state_t::start()); + while (!todo.empty()) { + auto next_edge = todo.top(); + todo.pop(); + handle_edge(next_edge); + } + os << "}\n"; +} + +} diff --git a/vespalib/src/vespa/vespalib/stllike/hashtable.h b/vespalib/src/vespa/vespalib/stllike/hashtable.h index e290d2f626c..fa88bb038b4 100644 --- a/vespalib/src/vespa/vespalib/stllike/hashtable.h +++ b/vespalib/src/vespa/vespalib/stllike/hashtable.h @@ -62,7 +62,7 @@ public: class prime_modulator { public: - prime_modulator(next_t sizeOfHashTable) noexcept : _modulo(sizeOfHashTable) { } + explicit prime_modulator(next_t sizeOfHashTable) noexcept : _modulo(sizeOfHashTable) { } next_t modulo(next_t hash) const noexcept { return hash % _modulo; } next_t getTableSize() const noexcept { return _modulo; } static next_t selectHashTableSize(size_t sz) { return hashtable_base::getModuloStl(sz); } @@ -76,7 +76,7 @@ public: class and_modulator { public: - and_modulator(next_t sizeOfHashTable) noexcept : _mask(sizeOfHashTable-1) { } + explicit and_modulator(next_t sizeOfHashTable) noexcept : _mask(sizeOfHashTable-1) { } next_t modulo(next_t hash) const noexcept { return hash & _mask; } next_t getTableSize() const noexcept { return _mask + 1; } static next_t selectHashTableSize(size_t sz) noexcept { return hashtable_base::getModuloSimple(sz); } @@ -198,7 +198,7 @@ public: using pointer = Value*; using iterator_category = std::forward_iterator_tag; - constexpr iterator(hashtable * hash) noexcept : _current(0), _hashTable(hash) { + constexpr explicit iterator(hashtable * hash) noexcept : _current(0), _hashTable(hash) { if (! _hashTable->_nodes[_current].valid()) { advanceToNextValidHash(); } @@ -242,7 +242,7 @@ public: using pointer = const Value*; using iterator_category = std::forward_iterator_tag; - constexpr const_iterator(const hashtable * hash) noexcept : _current(0), _hashTable(hash) { + constexpr explicit const_iterator(const hashtable * hash) noexcept : _current(0), _hashTable(hash) { if (! _hashTable->_nodes[_current].valid()) { advanceToNextValidHash(); } @@ -282,7 +282,7 @@ public: hashtable & operator = (hashtable &&) noexcept = default; hashtable(const hashtable &); hashtable & operator = (const hashtable &); - hashtable(size_t reservedSpace); + explicit hashtable(size_t reservedSpace); hashtable(size_t reservedSpace, const Hash & hasher, const Equal & equal); virtual ~hashtable(); constexpr iterator begin() noexcept { return iterator(this); } diff --git a/vespalib/src/vespa/vespalib/stllike/hashtable.hpp b/vespalib/src/vespa/vespalib/stllike/hashtable.hpp index 6d2d397a887..040e421f68c 100644 --- a/vespalib/src/vespa/vespalib/stllike/hashtable.hpp +++ b/vespalib/src/vespa/vespalib/stllike/hashtable.hpp @@ -53,8 +53,7 @@ hashtable<Key, Value, Hash, Equal, KeyExtract, Modulator>::hashtable(size_t rese _nodes(createStore<NodeStore>(reservedSpace, _modulator.getTableSize())), _hasher(hasher), _equal(equal) -{ -} +{ } template< typename Key, typename Value, typename Hash, typename Equal, typename KeyExtract, typename Modulator > hashtable<Key, Value, Hash, Equal, KeyExtract, Modulator>::hashtable(const hashtable &) = default; @@ -130,7 +129,7 @@ hashtable<Key, Value, Hash, Equal, KeyExtract, Modulator>::insert_internal(V && _count++; return insert_result(iterator(this, h), true); } - return insert_internal_cold(std::move(node), h); + return insert_internal_cold(std::forward<V>(node), h); } template< typename Key, typename Value, typename Hash, typename Equal, typename KeyExtract, typename Modulator > diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h index 99e3f8cfe13..489b16b1ed4 100644 --- a/vespalib/src/vespa/vespalib/text/utf8.h +++ b/vespalib/src/vespa/vespalib/text/utf8.h @@ -321,6 +321,7 @@ public: return i; } + const char* get_current_ptr() const noexcept { return _p; } }; diff --git a/vespalib/src/vespa/vespalib/util/address_space.h b/vespalib/src/vespa/vespalib/util/address_space.h index 948217bfd2b..fd172427720 100644 --- a/vespalib/src/vespa/vespalib/util/address_space.h +++ b/vespalib/src/vespa/vespalib/util/address_space.h @@ -2,6 +2,7 @@ #pragma once +#include <cstddef> #include <iosfwd> namespace vespalib { |