aboutsummaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp50
-rw-r--r--vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt9
-rw-r--r--vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp395
-rw-r--r--vespalib/src/tests/memorydatastore/memorydatastore.cpp56
-rw-r--r--vespalib/src/tests/stllike/hash_test.cpp8
-rw-r--r--vespalib/src/vespa/vespalib/btree/btreeiterator.h10
-rw-r--r--vespalib/src/vespa/vespalib/btree/btreeiterator.hpp8
-rw-r--r--vespalib/src/vespa/vespalib/btree/btreenode.h18
-rw-r--r--vespalib/src/vespa/vespalib/data/memorydatastore.cpp17
-rw-r--r--vespalib/src/vespa/vespalib/data/memorydatastore.h85
-rw-r--r--vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h2
-rw-r--r--vespalib/src/vespa/vespalib/data/slime/slime.cpp30
-rw-r--r--vespalib/src/vespa/vespalib/data/slime/slime.h39
-rw-r--r--vespalib/src/vespa/vespalib/data/slime/symbol.h4
-rw-r--r--vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp22
-rw-r--r--vespalib/src/vespa/vespalib/data/slime/symbol_table.h13
-rw-r--r--vespalib/src/vespa/vespalib/datastore/compaction_strategy.h3
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt1
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp5
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h3
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp91
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp16
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h38
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp76
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/sparse_state.h6
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp10
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.h63
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp586
-rw-r--r--vespalib/src/vespa/vespalib/stllike/hashtable.h10
-rw-r--r--vespalib/src/vespa/vespalib/stllike/hashtable.hpp5
-rw-r--r--vespalib/src/vespa/vespalib/text/utf8.h1
-rw-r--r--vespalib/src/vespa/vespalib/util/address_space.h1
33 files changed, 1348 insertions, 334 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index c1d7e17b457..56dcd9abdf6 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -96,6 +96,7 @@ vespa_define_module(
src/tests/fileheader
src/tests/floatingpointtype
src/tests/fuzzy
+ src/tests/fuzzy/table_dfa
src/tests/gencnt
src/tests/growablebytebuffer
src/tests/guard
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index c235cb99509..919ba328085 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -11,6 +11,7 @@
#include <string>
#include <string_view>
#include <gtest/gtest.h>
+#include <gmock/gmock.h>
using namespace ::testing;
using namespace vespalib::fuzzy;
@@ -82,7 +83,8 @@ INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes,
Combine(Values(LevenshteinDfa::Casing::Uncased,
LevenshteinDfa::Casing::Cased),
Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit)),
+ LevenshteinDfa::DfaType::Implicit,
+ LevenshteinDfa::DfaType::Table)),
LevenshteinDfaTest::stringify_params);
// Same as existing non-DFA Levenshtein tests, but with some added instantiations
@@ -122,8 +124,10 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt);
}
-void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) {
- std::string successor;
+void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source,
+ std::string_view expected_successor, std::string_view successor_prefix)
+{
+ std::string successor(successor_prefix);
auto m = dfa.match(source, successor);
if (m.matches()) {
FAIL() << "Expected '" << source << "' to emit a successor, but it "
@@ -131,15 +135,21 @@ void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std:
<< " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)";
}
EXPECT_EQ(successor, expected_successor);
- EXPECT_TRUE(dfa.match(successor).matches());
+ // Must skip any caller-provided successor prefix before checking if it matches the target
+ auto successor_suffix = successor.substr(successor_prefix.size());
+ EXPECT_TRUE(dfa.match(successor_suffix).matches());
// Make sure the UTF-32 successor output is codepoint-wise identical to the UTF-8 successor
- std::vector<uint32_t> u32successor;
+ std::vector<uint32_t> u32successor(utf8_string_to_utf32(successor_prefix));
m = dfa.match(source, u32successor);
EXPECT_FALSE(m.matches());
expect_utf32_string_code_point_equal_to_utf8(u32successor, successor);
}
+void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) {
+ test_dfa_successor(dfa, source, expected_successor, {});
+}
+
TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
@@ -201,6 +211,28 @@ TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) {
test_dfa_successor(dfa, "vespa", "w");
}
+TEST_P(LevenshteinDfaTest, caller_provided_successor_prefix_is_preserved_on_mismatch) {
+ auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
+
+ // Same inputs as existing successor tests, but with a preserved prefix in the generated successor
+ test_dfa_successor(dfa, "", "yolo\x01""food", "yolo");
+ test_dfa_successor(dfa, "faa", "xyzfaod", "xyz");
+ test_dfa_successor(dfa, "fooooo", "ABCfoop", "ABC");
+ test_dfa_successor(dfa, "ooof", "ABCpfood", "ABC");
+ test_dfa_successor(dfa, "gp", "yolohfood", "yolo");
+
+ dfa = LevenshteinDfa::build("", 1, casing(), dfa_type());
+ test_dfa_successor(dfa, "aa", "foob", "foo");
+}
+
+TEST_P(LevenshteinDfaTest, caller_provided_successor_prefix_is_preserved_on_match) {
+ auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
+ std::string successor = "bar";
+ auto m = dfa.match("mood", successor);
+ EXPECT_TRUE(m.matches());
+ EXPECT_THAT(successor, StartsWith("bar"));
+}
+
// We should normally be able to rely on higher-level components to ensure we
// only receive valid UTF-8, but make sure we don't choke on it if we do get it.
TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) {
@@ -233,7 +265,8 @@ struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> {
INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
LevenshteinDfaCasingTest,
Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit),
+ LevenshteinDfa::DfaType::Implicit,
+ LevenshteinDfa::DfaType::Table),
PrintToStringParamName());
TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) {
@@ -315,7 +348,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
Combine(Values(LevenshteinDfa::Casing::Uncased,
LevenshteinDfa::Casing::Cased),
Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit),
+ LevenshteinDfa::DfaType::Implicit,
+ LevenshteinDfa::DfaType::Table),
Values(1, 2)),
LevenshteinDfaSuccessorTest::stringify_params);
@@ -342,6 +376,7 @@ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
std::string skip_to, successor;
for (uint32_t j = 0; j < 256; ++j) {
const auto source = bits_to_str(static_cast<uint8_t>(j));
+ successor.clear();
auto maybe_match = target_dfa.match(source, successor);
if (maybe_match.matches() && !skip_to.empty()) {
ASSERT_GE(source, skip_to);
@@ -597,6 +632,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
auto end = dict.cend();
std::string successor;
while (iter != end) {
+ successor.clear();
auto maybe_match = dfa.match(*iter, successor);
if (maybe_match.matches()) {
++iter;
diff --git a/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt
new file mode 100644
index 00000000000..1017ac99564
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/table_dfa/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_fuzzy_table_dfa_test_app TEST
+ SOURCES
+ table_dfa_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+ )
+vespa_add_test(NAME vespalib_fuzzy_table_dfa_test_app COMMAND vespalib_fuzzy_table_dfa_test_app)
diff --git a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
new file mode 100644
index 00000000000..acb68a56de7
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
@@ -0,0 +1,395 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/table_dfa.hpp>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <set>
+
+using namespace ::testing;
+using namespace vespalib::fuzzy;
+
+// test/experiment with low-level concepts underlying the construction
+// of the tables used in the table-driven dfa implementation.
+
+TEST(TableDfaTest, position) {
+ Position pos1 = Position::start();
+ EXPECT_EQ(pos1.index, 0);
+ EXPECT_EQ(pos1.edits, 0);
+ Position pos2(2, 3);
+ EXPECT_EQ(pos2.index, 2);
+ EXPECT_EQ(pos2.edits, 3);
+}
+
+TEST(TableDfaTest, position_equality) {
+ Position pos1(0, 0);
+ Position pos2(0, 1);
+ Position pos3(1, 0);
+ EXPECT_TRUE(pos1 == pos1);
+ EXPECT_FALSE(pos1 == pos2);
+ EXPECT_FALSE(pos1 == pos2);
+}
+
+TEST(TableDfaTest, position_sort_order) {
+ std::vector<Position> list;
+ list.emplace_back(0,1);
+ list.emplace_back(0,0);
+ list.emplace_back(1,0);
+ list.emplace_back(1,1);
+ std::sort(list.begin(), list.end());
+ EXPECT_EQ(list[0].index, 0);
+ EXPECT_EQ(list[0].edits, 0);
+ EXPECT_EQ(list[1].index, 1);
+ EXPECT_EQ(list[1].edits, 0);
+ EXPECT_EQ(list[2].index, 0);
+ EXPECT_EQ(list[2].edits, 1);
+ EXPECT_EQ(list[3].index, 1);
+ EXPECT_EQ(list[3].edits, 1);
+}
+
+TEST(TableDfaTest, position_subsumption) {
+ Position pos1(0, 0);
+ Position pos2(0, 1);
+ Position pos3(0, 2);
+
+ Position pos4(1, 0);
+ Position pos5(1, 1);
+ Position pos6(1, 2);
+
+ Position pos7(2, 0);
+ Position pos8(2, 1);
+ Position pos9(2, 2);
+
+ EXPECT_FALSE(pos1.subsumes(pos1));
+ EXPECT_TRUE(pos1.subsumes(pos2));
+ EXPECT_TRUE(pos1.subsumes(pos3));
+ EXPECT_FALSE(pos1.subsumes(pos4));
+ EXPECT_TRUE(pos1.subsumes(pos5));
+ EXPECT_TRUE(pos1.subsumes(pos6));
+ EXPECT_FALSE(pos1.subsumes(pos7));
+ EXPECT_FALSE(pos1.subsumes(pos8));
+ EXPECT_TRUE(pos1.subsumes(pos9));
+
+ EXPECT_FALSE(pos5.subsumes(pos1));
+ EXPECT_FALSE(pos5.subsumes(pos2));
+ EXPECT_TRUE(pos5.subsumes(pos3));
+ EXPECT_FALSE(pos5.subsumes(pos4));
+ EXPECT_FALSE(pos5.subsumes(pos5));
+ EXPECT_TRUE(pos5.subsumes(pos6));
+ EXPECT_FALSE(pos5.subsumes(pos7));
+ EXPECT_FALSE(pos5.subsumes(pos8));
+ EXPECT_TRUE(pos5.subsumes(pos9));
+}
+
+TEST(TableDfaTest, position_materialization) {
+ EXPECT_EQ(Position(1,1).materialize(0).index, 0);
+ EXPECT_EQ(Position(1,1).materialize(1).index, 1);
+ EXPECT_EQ(Position(1,1).materialize(2).index, 2);
+ EXPECT_EQ(Position(1,1).materialize(0).edits, 2);
+ EXPECT_EQ(Position(1,1).materialize(1).edits, 1);
+ EXPECT_EQ(Position(1,1).materialize(2).edits, 2);
+}
+
+TEST(TableDfaTest, position_to_string) {
+ Position pos1(0, 0);
+ Position pos2(1, 2);
+ Position pos3(2, 3);
+ EXPECT_EQ(pos1.to_string(), fmt("0#0"));
+ EXPECT_EQ(pos2.to_string(), fmt("1#2"));
+ EXPECT_EQ(pos3.to_string(), fmt("2#3"));
+}
+
+TEST(TableDfaTest, state_creation_reorder) {
+ EXPECT_EQ(State::create<5>({{0,1},{2,0}}).to_string(), fmt("{2#0,0#1}"));
+ EXPECT_EQ(State::create<5>({{2,0},{0,0}}).to_string(), fmt("{0#0,2#0}"));
+}
+
+TEST(TableDfaTest, state_creation_duplicate_removal) {
+ EXPECT_EQ(State::create<5>({{0,0},{0,0},{2,1},{2,1}}).to_string(), fmt("{0#0,2#1}"));
+}
+
+TEST(TableDfaTest, state_creation_edit_cutoff) {
+ EXPECT_EQ(State::create<2>({{0,0},{5,2},{10,3}}).to_string(), fmt("{0#0,5#2}"));
+}
+
+TEST(TableDfaTest, state_creation_subsumption_collapsing) {
+ EXPECT_EQ(State::create<2>({{0,0},{1,1}}).to_string(), fmt("{0#0}"));
+ EXPECT_EQ(State::create<2>({{0,1},{1,0}}).to_string(), fmt("{1#0}"));
+ EXPECT_EQ(State::create<2>({{0,0},{2,2}}).to_string(), fmt("{0#0}"));
+ EXPECT_EQ(State::create<2>({{0,2},{2,0}}).to_string(), fmt("{2#0}"));
+}
+
+TEST(TableDfaTest, state_normalization) {
+ auto state1 = State::create<2>({{2,1},{3,1}});
+ auto state2 = State::create<2>({{5,0},{3,1}});
+ EXPECT_EQ(state1.to_string(), fmt("{2#1,3#1}"));
+ EXPECT_EQ(state2.to_string(), fmt("{5#0,3#1}"));
+ EXPECT_EQ(state1.normalize(), 2);
+ EXPECT_EQ(state2.normalize(), 3);
+ EXPECT_EQ(state1.to_string(), fmt("{0#1,1#1}"));
+ EXPECT_EQ(state2.to_string(), fmt("{2#0,0#1}"));
+}
+
+TEST(TableDfaTest, state_repo) {
+ StateRepo repo;
+ EXPECT_EQ(repo.state_to_idx(State::failed()), 0);
+ EXPECT_EQ(repo.state_to_idx(State::start()), 1);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{1,0}})), 2);
+ EXPECT_EQ(repo.state_to_idx(State::create<2>({{0,0},{2,1}})), 3);
+ EXPECT_EQ(repo.size(), 4);
+ EXPECT_EQ(repo.idx_to_state(0).to_string(), fmt("{}"));
+ EXPECT_EQ(repo.idx_to_state(1).to_string(), fmt("{0#0}"));
+ EXPECT_EQ(repo.idx_to_state(2).to_string(), fmt("{0#0,1#0}"));
+ EXPECT_EQ(repo.idx_to_state(3).to_string(), fmt("{0#0,2#1}"));
+}
+
+TEST(TableDfaTest, expand_bits) {
+ auto yes = expand_bits<2>(0x1f);
+ auto no = expand_bits<2>(0x00);
+ auto odd = expand_bits<2>(0x0a);
+ auto even = expand_bits<2>(0x15);
+ ASSERT_EQ(yes.size(), 5);
+ ASSERT_EQ(no.size(), 5);
+ ASSERT_EQ(odd.size(), 5);
+ ASSERT_EQ(even.size(), 5);
+ for (size_t i = 0; i < 5; ++i) {
+ EXPECT_TRUE(yes[i]);
+ EXPECT_FALSE(no[i]);
+ EXPECT_EQ(odd[i], bool(i % 2 == 1));
+ EXPECT_EQ(even[i], bool(i % 2 == 0));
+ }
+}
+
+TEST(TableDfaTest, format_bits) {
+ EXPECT_EQ(format_vector(expand_bits<1>(0)), fmt("[0,0,0]"));
+ EXPECT_EQ(format_vector(expand_bits<1>(7)), fmt("[1,1,1]"));
+ EXPECT_EQ(format_vector(expand_bits<1>(5)), fmt("[1,0,1]"));
+ EXPECT_EQ(format_vector(expand_bits<1>(2)), fmt("[0,1,0]"));
+ EXPECT_EQ(format_vector(expand_bits<2>(31)), fmt("[1,1,1,1,1]"));
+ EXPECT_EQ(format_vector(expand_bits<2>(21)), fmt("[1,0,1,0,1]"));
+ EXPECT_EQ(format_vector(expand_bits<2>(31), true), fmt("11111"));
+ EXPECT_EQ(format_vector(expand_bits<2>(21), true), fmt("10101"));
+}
+
+template <uint8_t N>
+void list_states() {
+ auto repo = make_state_repo<N>();
+ EXPECT_EQ(num_states<N>(), repo.size());
+ fprintf(stderr, "max_edits: %u, number of states: %zu\n", N, repo.size());
+ for (uint32_t i = 0; i < repo.size(); ++i) {
+ fprintf(stderr, " state %u: %s\n", i, repo.idx_to_state(i).to_string().c_str());
+ }
+}
+
+TEST(TableDfaTest, list_states_for_max_edits_1) { list_states<1>(); }
+TEST(TableDfaTest, list_states_for_max_edits_2) { list_states<2>(); }
+
+template <uint8_t N>
+void list_edits() {
+ auto repo = make_state_repo<N>();
+ fprintf(stderr,
+ "per state, listing the minimal number of edits needed\n"
+ "to reach offsets at and beyond its minimal boundary\n");
+ for (uint32_t i = 0; i < repo.size(); ++i) {
+ const State &state = repo.idx_to_state(i);
+ fprintf(stderr, "%-23s : %s\n", state.to_string().c_str(),
+ format_vector(state.make_edit_vector<N>()).c_str());
+ }
+}
+
+TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_1) { list_edits<1>(); }
+TEST(TableDfaTest, list_edits_at_input_end_for_max_edits_2) { list_edits<2>(); }
+
+template <uint8_t N>
+void list_transitions() {
+ auto repo = make_state_repo<N>();
+ for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ auto bits = expand_bits<N>(i);
+ State new_state = state.next<N>(bits);
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ fprintf(stderr, "%u:%s,i --%s--> %u:%s,%s\n", idx, state.to_string().c_str(),
+ format_vector(bits).c_str(), new_idx, new_state.to_string().c_str(),
+ (step == 0) ? "i" : fmt("i+%u", step).c_str());
+ }
+ }
+}
+
+TEST(TableDfaTest, list_transitions_for_max_edits_1) { list_transitions<1>(); }
+
+// Simulate all possible ways we can approach the end of the word we
+// are matching. Verify that no transition taken can produce a state
+// with a minimal boundary that exceeds the boundary of the word
+// itself. Verifying this will enable us to not care about word size
+// while simulating the dfa.
+template <uint8_t N>
+void verify_word_end_boundary() {
+ auto repo = make_state_repo<N>();
+ using StateSet = std::set<uint32_t>;
+ std::vector<StateSet> active(window_size<N>() + 1);
+ for (size_t i = 1; i < repo.size(); ++i) {
+ active[0].insert(i);
+ }
+ EXPECT_EQ(active.size(), window_size<N>() + 1);
+ EXPECT_EQ(active[0].size(), repo.size() - 1);
+ fprintf(stderr, "verifying word end for max edits %u\n", N);
+ uint32_t edge_shape = 0;
+ for (uint32_t active_idx = 0; active_idx < active.size(); ++active_idx) {
+ fprintf(stderr, " edge shape: %s, max step: %zu, active_states: %zu\n",
+ format_vector(expand_bits<N>(edge_shape)).c_str(), active.size() - active_idx - 1, active[active_idx].size());
+ for (uint32_t idx: active[active_idx]) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ if ((i & edge_shape) == 0) {
+ State new_state = state.next<N>(expand_bits<N>(i));
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ if (new_idx != 0) {
+ ASSERT_GT(active.size(), active_idx + step);
+ active[active_idx + step].insert(new_idx);
+ }
+ }
+ }
+ }
+ edge_shape = (edge_shape << 1) + 1;
+ }
+ EXPECT_EQ(edge_shape, (1 << (window_size<N>() + 1)) - 1);
+ while (!active.back().empty()) {
+ fprintf(stderr, " residue states after word end: %zu\n", active.back().size());
+ StateSet residue;
+ for (uint32_t idx: active.back()) {
+ const State &state = repo.idx_to_state(idx);
+ State new_state = state.next<N>(expand_bits<N>(0));
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ ASSERT_EQ(step, 0);
+ if (new_idx != 0) {
+ residue.insert(new_idx);
+ }
+ }
+ active.back() = std::move(residue);
+ }
+}
+
+TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_1) {
+ verify_word_end_boundary<1>();
+}
+
+TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_2) {
+ verify_word_end_boundary<2>();
+}
+
+template <uint8_t N>
+void verify_inline_tfa() {
+ auto tfa = make_tfa<N>();
+ fprintf(stderr, "verifying TFA for N = %u (byte size: %zu)\n", N, sizeof(*tfa));
+ ASSERT_EQ(tfa->table.size(), num_states<N>());
+ ASSERT_EQ(tfa->edits.size(), num_states<N>());
+ for (size_t state = 0; state < num_states<N>(); ++state) {
+ ASSERT_EQ(tfa->table[state].size(), num_transitions<N>());
+ for (size_t transition = 0; transition < num_transitions<N>(); ++transition) {
+ EXPECT_EQ(tfa->table[state][transition].step, InlineTfa<N>::table[state][transition].step);
+ EXPECT_EQ(tfa->table[state][transition].state, InlineTfa<N>::table[state][transition].state);
+ }
+ ASSERT_EQ(tfa->edits[state].size(), window_size<N>());
+ for (size_t offset = 0; offset < window_size<N>(); ++offset) {
+ EXPECT_EQ(tfa->edits[state][offset], InlineTfa<N>::edits[state][offset]);
+ }
+ }
+}
+
+TEST(TableDfaTest, verify_inline_tfa_with_max_edits_1) {
+ verify_inline_tfa<1>();
+}
+
+TEST(TableDfaTest, verify_inline_tfa_with_max_edits_2) {
+ verify_inline_tfa<2>();
+}
+
+template <uint8_t N>
+void dump_tfa_as_code() {
+ auto tfa = make_tfa<N>();
+ fprintf(stderr, "// start of auto-generated code for N = %u\n", N);
+ fprintf(stderr, "template <> struct InlineTfa<%u> {\n", N);
+ fprintf(stderr, " static constexpr Transition table[%zu][%zu] = {\n", num_states<N>(), num_transitions<N>());
+ for (size_t state = 0; state < num_states<N>(); ++state) {
+ fprintf(stderr, " {");
+ for (size_t transition = 0; transition < num_transitions<N>(); ++transition) {
+ if (transition > 0) {
+ fprintf(stderr, ",");
+ }
+ fprintf(stderr, "{%u,%u}", tfa->table[state][transition].step, tfa->table[state][transition].state);
+ }
+ fprintf(stderr, "}%s\n", ((state + 1) < num_states<N>()) ? "," : "");
+ }
+ fprintf(stderr, " };\n");
+ fprintf(stderr, " static constexpr uint8_t edits[%zu][%zu] = {\n", num_states<N>(), window_size<N>());
+ for (size_t state = 0; state < num_states<N>(); ++state) {
+ fprintf(stderr, " {");
+ for (size_t offset = 0; offset < window_size<N>(); ++offset) {
+ if (offset > 0) {
+ fprintf(stderr, ",");
+ }
+ fprintf(stderr, "%u", tfa->edits[state][offset]);
+ }
+ fprintf(stderr, "}%s\n", ((state + 1) < num_states<N>()) ? "," : "");
+ }
+ fprintf(stderr, " };\n");
+ fprintf(stderr, "};\n");
+ fprintf(stderr, "// end of auto-generated code for N = %u\n", N);
+}
+
+TEST(TableDfaTest, dump_tfa_with_max_edits_1_as_code) {
+ dump_tfa_as_code<1>();
+}
+
+TEST(TableDfaTest, dump_tfa_with_max_edits_2_as_code) {
+ dump_tfa_as_code<2>();
+}
+
+template <uint8_t N>
+void dump_tfa_graph() {
+ auto repo = make_state_repo<N>();
+ fprintf(stderr, "digraph tfa {\n");
+ for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+ fprintf(stderr, " %u [label=\"%s\"];\n", idx,
+ repo.idx_to_state(idx).to_string().c_str());
+ }
+ // omit transitions from the failure state to itself
+ for (uint32_t idx = 1; idx < repo.size(); ++idx) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ auto bits = expand_bits<N>(i);
+ State new_state = state.next<N>(bits);
+ uint32_t step = new_state.normalize();
+ uint32_t new_idx = repo.state_to_idx(new_state);
+ ASSERT_LT(new_idx, repo.size());
+ if (bits[0] && idx == new_idx && step == 1) {
+ // omit simple transitions to yourself
+ } else {
+ fprintf(stderr, " %u -> %u [label=\"%s,%u\"];\n", idx, new_idx,
+ format_vector(bits, true).c_str(), step);
+ }
+ }
+ }
+ fprintf(stderr, "}\n");
+}
+
+TEST(TableDfaTest, graphviz_for_tfa_with_max_edits_1) {
+ dump_tfa_graph<1>();
+}
+
+TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) {
+ auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table);
+ std::ostringstream out;
+ dfa.dump_as_graphviz(out);
+ fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage());
+ fprintf(stderr, "%s", out.str().c_str());
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/tests/memorydatastore/memorydatastore.cpp b/vespalib/src/tests/memorydatastore/memorydatastore.cpp
index 1d49b0af91b..7eab32601de 100644
--- a/vespalib/src/tests/memorydatastore/memorydatastore.cpp
+++ b/vespalib/src/tests/memorydatastore/memorydatastore.cpp
@@ -6,19 +6,9 @@
using namespace vespalib;
-class MemoryDataStoreTest : public vespalib::TestApp
+TEST("testMemoryDataStore")
{
-private:
- void testMemoryDataStore();
- void testVariableSizeVector();
-public:
- int Main() override;
-};
-
-void
-MemoryDataStoreTest::testMemoryDataStore()
-{
- MemoryDataStore s(alloc::Alloc::alloc(256));
+ MemoryDataStore s(alloc::Alloc::alloc(256), nullptr);
std::vector<MemoryDataStore::Reference> v;
v.push_back(s.push_back("mumbo", 5));
for (size_t i(0); i < 50; i++) {
@@ -28,45 +18,9 @@ MemoryDataStoreTest::testMemoryDataStore()
v.push_back(s.push_back("mumbo", 5));
EXPECT_EQUAL(52ul, v.size());
EXPECT_NOT_EQUAL(static_cast<const char *>(v[50].data()) + 5, v[51].data());
- for (size_t i(0); i < v.size(); i++) {
- EXPECT_EQUAL(0, memcmp("mumbo", v[i].data(), 5));
+ for (auto & i : v) {
+ EXPECT_EQUAL(0, memcmp("mumbo", i.data(), 5));
}
}
-void
-MemoryDataStoreTest::testVariableSizeVector()
-{
- VariableSizeVector v(20000, 5*20000);
- for (size_t i(0); i < 10000; i++) {
- asciistream os;
- os << i;
- v.push_back(os.str().data(), os.str().size());
- }
- for (size_t i(0); i < v.size(); i++) {
- asciistream os;
- os << i;
- EXPECT_EQUAL(os.str().size(), v[i].size());
- EXPECT_EQUAL(0, memcmp(os.str().data(), v[i].data(), os.str().size()));
- }
- size_t i(0);
- for (auto it(v.begin()), mt(v.end()); it != mt; it++, i++) {
- asciistream os;
- os << i;
- EXPECT_EQUAL(os.str().size(), it->size());
- EXPECT_EQUAL(0, memcmp(os.str().data(), (*it).data(), os.str().size()));
- }
-
-}
-
-int
-MemoryDataStoreTest::Main()
-{
- TEST_INIT("data_test");
- testMemoryDataStore();
- testVariableSizeVector();
-
- TEST_DONE();
-}
-
-TEST_APPHOOK(MemoryDataStoreTest);
-
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/tests/stllike/hash_test.cpp b/vespalib/src/tests/stllike/hash_test.cpp
index ae27d2dc58b..dc6e48100a9 100644
--- a/vespalib/src/tests/stllike/hash_test.cpp
+++ b/vespalib/src/tests/stllike/hash_test.cpp
@@ -494,6 +494,14 @@ TEST("test hash set initializer list - empty")
EXPECT_EQUAL(0u, s.size());
}
+TEST("empty hash_set can be looked up")
+{
+ IntHashSet s;
+ EXPECT_EQUAL(0u, s.size());
+ EXPECT_EQUAL(1u, s.capacity());
+ EXPECT_TRUE(s.find(1) == s.end());
+}
+
TEST("test hash set initializer list - 1 element")
{
IntHashSet s = {1};
diff --git a/vespalib/src/vespa/vespalib/btree/btreeiterator.h b/vespalib/src/vespa/vespalib/btree/btreeiterator.h
index 2418da18c23..9480e5880e0 100644
--- a/vespalib/src/vespa/vespalib/btree/btreeiterator.h
+++ b/vespalib/src/vespa/vespalib/btree/btreeiterator.h
@@ -302,22 +302,22 @@ public:
/**
* Get key at current iterator location.
*/
- const KeyType & getKey() const { return _leaf.getKey(); }
+ const KeyType & getKey() const noexcept { return _leaf.getKey(); }
/**
* Get data at current iterator location.
*/
- const DataType & getData() const { return _leaf.getData(); }
+ const DataType & getData() const noexcept { return _leaf.getData(); }
/**
* Check if iterator is at a valid element, i.e. not at end.
*/
- bool valid() const { return _leaf.valid(); }
+ bool valid() const noexcept{ return _leaf.valid(); }
/**
* Return the number of elements in the tree.
*/
- size_t size() const;
+ size_t size() const noexcept;
/**
@@ -333,7 +333,7 @@ public:
/**
* Return if the tree has data or not (e.g. keys and data or only keys).
*/
- static bool hasData() { return LeafNodeType::hasData(); }
+ static bool hasData() noexcept { return LeafNodeType::hasData(); }
/**
* Move the iterator directly to end. Used by findHelper method in BTree.
diff --git a/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp b/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp
index b7927feaa1a..d6dda0047ce 100644
--- a/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp
+++ b/vespalib/src/vespa/vespalib/btree/btreeiterator.hpp
@@ -387,15 +387,13 @@ position(uint32_t levels) const
res += inode->validLeaves();
for (uint32_t c = elem.getIdx(); c < slots; ++c) {
BTreeNode::Ref node = inode->getChild(c);
- const InternalNodeType *jnode =
- _allocator->mapInternalRef(node);
+ const InternalNodeType *jnode = _allocator->mapInternalRef(node);
res -= jnode->validLeaves();
}
} else {
for (uint32_t c = 0; c < elem.getIdx(); ++c) {
BTreeNode::Ref node = inode->getChild(c);
- const InternalNodeType *jnode =
- _allocator->mapInternalRef(node);
+ const InternalNodeType *jnode = _allocator->mapInternalRef(node);
res += jnode->validLeaves();
}
}
@@ -484,7 +482,7 @@ template <typename KeyT, typename DataT, typename AggrT,
uint32_t INTERNAL_SLOTS, uint32_t LEAF_SLOTS, uint32_t PATH_SIZE>
size_t
BTreeIteratorBase<KeyT, DataT, AggrT, INTERNAL_SLOTS, LEAF_SLOTS, PATH_SIZE>::
-size() const
+size() const noexcept
{
if (_pathSize > 0) {
return _path[_pathSize - 1].getNode()->validLeaves();
diff --git a/vespalib/src/vespa/vespalib/btree/btreenode.h b/vespalib/src/vespa/vespalib/btree/btreenode.h
index 0a77a0b4685..4931021d771 100644
--- a/vespalib/src/vespa/vespalib/btree/btreenode.h
+++ b/vespalib/src/vespa/vespalib/btree/btreenode.h
@@ -67,14 +67,14 @@ public:
using Ref = datastore::EntryRef;
using ChildRef = datastore::AtomicEntryRef;
- bool isLeaf() const { return _level == 0u; }
- bool getFrozen() const { return _isFrozen; }
- void freeze() { _isFrozen = true; }
- void unFreeze() { _isFrozen = false; }
- void setLevel(uint8_t level) { _level = level; }
- uint32_t getLevel() const { return _level; }
- uint32_t validSlots() const { return _validSlots; }
- void setValidSlots(uint16_t validSlots_) { _validSlots = validSlots_; }
+ bool isLeaf() const noexcept { return _level == 0u; }
+ bool getFrozen() const noexcept { return _isFrozen; }
+ void freeze() noexcept { _isFrozen = true; }
+ void unFreeze() noexcept { _isFrozen = false; }
+ void setLevel(uint8_t level) noexcept { _level = level; }
+ uint32_t getLevel() const noexcept { return _level; }
+ uint32_t validSlots() const noexcept { return _validSlots; }
+ void setValidSlots(uint16_t validSlots_) noexcept { _validSlots = validSlots_; }
};
@@ -358,7 +358,7 @@ public:
void insert(uint32_t idx, const KeyT & key, BTreeNode::Ref child) {
insert(idx, key, BTreeNode::ChildRef(child));
}
- uint32_t validLeaves() const { return _validLeaves; }
+ uint32_t validLeaves() const noexcept { return _validLeaves; }
void setValidLeaves(uint32_t newValidLeaves) { _validLeaves = newValidLeaves; }
void incValidLeaves(uint32_t delta) { _validLeaves += delta; }
void decValidLeaves(uint32_t delta) { _validLeaves -= delta; }
diff --git a/vespalib/src/vespa/vespalib/data/memorydatastore.cpp b/vespalib/src/vespa/vespalib/data/memorydatastore.cpp
index 354787690c2..6d483e6ff4e 100644
--- a/vespalib/src/vespa/vespalib/data/memorydatastore.cpp
+++ b/vespalib/src/vespa/vespalib/data/memorydatastore.cpp
@@ -41,21 +41,4 @@ MemoryDataStore::push_back(const void * data, const size_t sz)
return ref;
}
-VariableSizeVector::VariableSizeVector(size_t initialCount, size_t initialBufferSize)
- : _vector(),
- _store(Alloc::alloc(initialBufferSize))
-{
- _vector.reserve(initialCount);
-}
-
-VariableSizeVector::~VariableSizeVector() = default;
-
-VariableSizeVector::Reference
-VariableSizeVector::push_back(const void * data, const size_t sz)
-{
- MemoryDataStore::Reference ptr(_store.push_back(data, sz));
- _vector.push_back(Reference(ptr.data(), sz));
- return _vector.back();
-}
-
} // namespace vespalib
diff --git a/vespalib/src/vespa/vespalib/data/memorydatastore.h b/vespalib/src/vespa/vespalib/data/memorydatastore.h
index 7022eb88051..6691211cdd0 100644
--- a/vespalib/src/vespa/vespalib/data/memorydatastore.h
+++ b/vespalib/src/vespa/vespalib/data/memorydatastore.h
@@ -13,18 +13,19 @@ namespace vespalib {
* It has the important property that once an object has been allocated it does not move in memory.
* It will start of by allocating one backing buffer and items stored will be appended here.
* When limit is exceeded a new buffer is allocated with twice the size of the previous and so it goes.
+ * You can also provide an optional lock to make it thread safe.
**/
class MemoryDataStore {
public:
class Reference {
public:
- Reference(void * data_) noexcept : _data(data_) { }
+ explicit Reference(void * data_) noexcept : _data(data_) { }
void * data() noexcept { return _data; }
const char * c_str() const noexcept { return static_cast<const char *>(_data); }
private:
void * _data;
};
- MemoryDataStore(alloc::Alloc && initialAlloc=alloc::Alloc::alloc(256), std::mutex * lock=nullptr);
+ MemoryDataStore(alloc::Alloc && initialAlloc, std::mutex * lock);
MemoryDataStore(const MemoryDataStore &) = delete;
MemoryDataStore & operator = (const MemoryDataStore &) = delete;
~MemoryDataStore();
@@ -33,7 +34,7 @@ public:
* for the lifetime of this object.
* @return A pointer/reference to the freshly stored object.
*/
- Reference push_back(const void * data, const size_t sz);
+ Reference push_back(const void * data, size_t sz);
void swap(MemoryDataStore & rhs) { _buffers.swap(rhs._buffers); }
void clear() noexcept {
_buffers.clear();
@@ -44,83 +45,5 @@ private:
std::mutex * _lock;
};
-class VariableSizeVector
-{
-public:
- class Reference {
- public:
- Reference(void * data_, size_t sz) noexcept : _data(data_), _sz(sz) { }
- void * data() noexcept { return _data; }
- const char * c_str() const noexcept { return static_cast<const char *>(_data); }
- size_t size() const noexcept { return _sz; }
- private:
- void * _data;
- size_t _sz;
- };
- class iterator {
- public:
- iterator(vespalib::Array<Reference> & v, size_t index) noexcept : _vector(&v), _index(index) {}
- Reference & operator * () const noexcept { return (*_vector)[_index]; }
- Reference * operator -> () const noexcept { return &(*_vector)[_index]; }
- iterator & operator ++ () noexcept {
- _index++;
- return *this;
- }
- iterator operator ++ (int) noexcept {
- iterator prev = *this;
- ++(*this);
- return prev;
- }
- bool operator==(const iterator& rhs) const noexcept { return (_index == rhs._index); }
- bool operator!=(const iterator& rhs) const noexcept { return (_index != rhs._index); }
- private:
- vespalib::Array<Reference> * _vector;
- size_t _index;
- };
- class const_iterator {
- public:
- const_iterator(const vespalib::Array<Reference> & v, size_t index) noexcept : _vector(&v), _index(index) {}
- const Reference & operator * () const noexcept { return (*_vector)[_index]; }
- const Reference * operator -> () const noexcept { return &(*_vector)[_index]; }
- const_iterator & operator ++ () noexcept {
- _index++;
- return *this;
- }
- const_iterator operator ++ (int) noexcept {
- const_iterator prev = *this;
- ++(*this);
- return prev;
- }
- bool operator==(const const_iterator& rhs) const noexcept { return (_index == rhs._index); }
- bool operator!=(const const_iterator& rhs) const noexcept { return (_index != rhs._index); }
- private:
- const vespalib::Array<Reference> * _vector;
- size_t _index;
- };
- VariableSizeVector(const VariableSizeVector &) = delete;
- VariableSizeVector & operator = (const VariableSizeVector &) = delete;
- VariableSizeVector(size_t initialCount, size_t initialBufferSize);
- ~VariableSizeVector();
- iterator begin() noexcept { return iterator(_vector, 0); }
- iterator end() noexcept { return iterator(_vector, size()); }
- const_iterator begin() const noexcept { return const_iterator(_vector, 0); }
- const_iterator end() const noexcept { return const_iterator(_vector, size()); }
- Reference push_back(const void * data, const size_t sz);
- Reference operator [] (uint32_t index) const noexcept { return _vector[index]; }
- size_t size() const noexcept { return _vector.size(); }
- bool empty() const noexcept { return _vector.empty(); }
- void swap(VariableSizeVector & rhs) noexcept {
- _vector.swap(rhs._vector);
- _store.swap(rhs._store);
- }
- void clear() {
- _vector.clear();
- _store.clear();
- }
-private:
- vespalib::Array<Reference> _vector;
- MemoryDataStore _store;
-};
-
} // namespace vespalib
diff --git a/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h b/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h
index 44dbf05c9da..fb4bc3943ae 100644
--- a/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h
+++ b/vespalib/src/vespa/vespalib/data/slime/named_symbol_lookup.h
@@ -20,7 +20,7 @@ private:
const Memory &_name;
public:
- NamedSymbolLookup(const SymbolTable &table, const Memory &name)
+ NamedSymbolLookup(const SymbolTable &table, const Memory &name) noexcept
: _table(table), _name(name) {}
Symbol lookup() const override;
};
diff --git a/vespalib/src/vespa/vespalib/data/slime/slime.cpp b/vespalib/src/vespa/vespalib/data/slime/slime.cpp
index d6fac44b360..5a9ec54584d 100644
--- a/vespalib/src/vespa/vespalib/data/slime/slime.cpp
+++ b/vespalib/src/vespa/vespalib/data/slime/slime.cpp
@@ -6,16 +6,6 @@
namespace vespalib {
-Slime::Params::Params() : Params(std::make_unique<SymbolTable>()) { }
-Slime::Params::Params(std::unique_ptr<SymbolTable> symbols) noexcept : _symbols(std::move(symbols)), _chunkSize(4096) { }
-Slime::Params::Params(Params &&) noexcept = default;
-Slime::Params::~Params() = default;
-
-std::unique_ptr<slime::SymbolTable>
-Slime::Params::detachSymbols() {
- return std::move(_symbols);
-}
-
Slime::Slime(Params params)
: _names(params.detachSymbols()),
_stash(std::make_unique<Stash>(params.getChunkSize())),
@@ -33,26 +23,6 @@ Slime::reclaimSymbols(Slime &&rhs) {
return std::move(rhs._names);
}
-size_t
-Slime::symbols() const noexcept {
- return _names->symbols();
-}
-
-Memory
-Slime::inspect(Symbol symbol) const {
- return _names->inspect(symbol);
-}
-
-slime::Symbol
-Slime::insert(Memory name) {
- return _names->insert(name);
-}
-
-slime::Symbol
-Slime::lookup(Memory name) const {
- return _names->lookup(name);
-}
-
bool operator == (const Slime & a, const Slime & b) noexcept
{
return a.get() == b.get();
diff --git a/vespalib/src/vespa/vespalib/data/slime/slime.h b/vespalib/src/vespa/vespalib/data/slime/slime.h
index a426f906563..4b789838009 100644
--- a/vespalib/src/vespa/vespalib/data/slime/slime.h
+++ b/vespalib/src/vespa/vespalib/data/slime/slime.h
@@ -21,6 +21,7 @@
#include "symbol.h"
#include "symbol_inserter.h"
#include "symbol_lookup.h"
+#include "symbol_table.h"
#include "type.h"
#include "value.h"
#include "value_factory.h"
@@ -51,32 +52,30 @@ private:
using Cursor = slime::Cursor;
using Inspector = slime::Inspector;
- std::unique_ptr<SymbolTable> _names;
- std::unique_ptr<Stash> _stash;
- RootValue _root;
+ std::unique_ptr<SymbolTable> _names;
+ std::unique_ptr<Stash> _stash;
+ RootValue _root;
public:
using UP = std::unique_ptr<Slime>;
class Params {
private:
- std::unique_ptr<SymbolTable> _symbols;
- size_t _chunkSize;
+ std::unique_ptr<SymbolTable> _symbols;
+ size_t _chunkSize;
public:
- Params();
- explicit Params(std::unique_ptr<SymbolTable> symbols) noexcept;
- Params(Params &&) noexcept;
- ~Params();
- Params & setChunkSize(size_t chunkSize) {
- _chunkSize = chunkSize;
- return *this;
- }
- size_t getChunkSize() const { return _chunkSize; }
- std::unique_ptr<SymbolTable> detachSymbols();
+ Params() : Params(4096) {}
+ explicit Params(size_t chunkSize) : _symbols(std::make_unique<SymbolTable>()), _chunkSize(chunkSize) {}
+ explicit Params(std::unique_ptr<SymbolTable> symbols) noexcept : _symbols(std::move(symbols)), _chunkSize(4096) {}
+ Params(Params &&) noexcept = default;
+ ~Params() = default;
+ size_t getChunkSize() const noexcept { return _chunkSize; }
+ std::unique_ptr<SymbolTable> detachSymbols() noexcept { return std::move(_symbols); }
};
/**
* Construct an initially empty Slime object.
**/
- explicit Slime(Params params = Params());
+ explicit Slime() : Slime(Params()) {}
+ explicit Slime(Params params);
~Slime();
@@ -88,13 +87,13 @@ public:
static std::unique_ptr<SymbolTable> reclaimSymbols(Slime &&rhs);
- size_t symbols() const noexcept;
+ size_t symbols() const noexcept { return _names->symbols(); }
- Memory inspect(Symbol symbol) const;
+ Memory inspect(Symbol symbol) const { return _names->inspect(symbol); }
- Symbol insert(Memory name);
+ Symbol insert(Memory name) { return _names->insert(name); }
- Symbol lookup(Memory name) const;
+ Symbol lookup(Memory name) const { return _names->lookup(name); }
Cursor &get() noexcept { return _root.get(); }
diff --git a/vespalib/src/vespa/vespalib/data/slime/symbol.h b/vespalib/src/vespa/vespalib/data/slime/symbol.h
index 3bce727fad9..a60a49fda27 100644
--- a/vespalib/src/vespa/vespalib/data/slime/symbol.h
+++ b/vespalib/src/vespa/vespalib/data/slime/symbol.h
@@ -19,8 +19,8 @@ private:
public:
Symbol() noexcept : _value(UNDEFINED) {}
Symbol(uint32_t v) noexcept : _value(v) {}
- bool undefined() const { return (_value == UNDEFINED); }
- uint32_t getValue() const { return _value; }
+ bool undefined() const noexcept { return (_value == UNDEFINED); }
+ uint32_t getValue() const noexcept { return _value; }
bool operator<(const Symbol &rhs) const noexcept { return (_value < rhs._value); }
bool operator==(const Symbol &rhs) const noexcept { return (_value == rhs._value); }
};
diff --git a/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp b/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp
index a3313516c64..dffe35707fc 100644
--- a/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp
+++ b/vespalib/src/vespa/vespalib/data/slime/symbol_table.cpp
@@ -5,10 +5,13 @@
namespace vespalib::slime {
-SymbolTable::SymbolTable(size_t expectedNumSymbols) :
- _symbols(3*expectedNumSymbols),
- _names(expectedNumSymbols, expectedNumSymbols*16)
-{ }
+SymbolTable::SymbolTable(size_t expectedNumSymbols)
+ : _stash(),
+ _symbols(3*expectedNumSymbols),
+ _names()
+{
+ _names.reserve(expectedNumSymbols);
+}
SymbolTable::~SymbolTable() = default;
@@ -16,6 +19,7 @@ void
SymbolTable::clear() {
_names.clear();
_symbols.clear();
+ _stash.clear();
}
Symbol
@@ -23,17 +27,21 @@ SymbolTable::insert(const Memory &name) {
SymbolMap::const_iterator pos = _symbols.find(name);
if (pos == _symbols.end()) {
Symbol symbol(_names.size());
- SymbolVector::Reference r(_names.push_back(name.data, name.size));
- _symbols.insert(std::make_pair(Memory(r.c_str(), r.size()), symbol));
+ char *buf = _stash.alloc(name.size);
+ memcpy(buf, name.data, name.size);
+ Memory backed(buf, name.size);
+ _names.push_back(backed);
+ _symbols.insert(std::make_pair(backed, symbol));
return symbol;
}
return pos->second;
}
+
Symbol
SymbolTable::lookup(const Memory &name) const {
SymbolMap::const_iterator pos = _symbols.find(name);
if (pos == _symbols.end()) {
- return Symbol();
+ return {};
}
return pos->second;
}
diff --git a/vespalib/src/vespa/vespalib/data/slime/symbol_table.h b/vespalib/src/vespa/vespalib/data/slime/symbol_table.h
index c5f3cf12fd6..0eae65cead0 100644
--- a/vespalib/src/vespa/vespalib/data/slime/symbol_table.h
+++ b/vespalib/src/vespa/vespalib/data/slime/symbol_table.h
@@ -4,8 +4,8 @@
#include "symbol.h"
#include <vespa/vespalib/data/memory.h>
+#include <vespa/vespalib/util/stash.h>
#include <vespa/vespalib/stllike/hash_map.h>
-#include <vespa/vespalib/data/memorydatastore.h>
namespace vespalib::slime {
@@ -21,21 +21,24 @@ private:
}
};
using SymbolMap = hash_map<Memory, Symbol, hasher>;
- using SymbolVector = VariableSizeVector;
+ using SymbolVector = std::vector<Memory>;
+ Stash _stash;
SymbolMap _symbols;
SymbolVector _names;
public:
using UP = std::unique_ptr<SymbolTable>;
- SymbolTable(size_t expectedNumSymbols=16);
+ SymbolTable() : SymbolTable(16) {}
+ explicit SymbolTable(size_t expectedNumSymbols);
+ SymbolTable(SymbolTable &&) noexcept = default;
+ SymbolTable & operator=(SymbolTable &&) noexcept = default;
~SymbolTable();
size_t symbols() const noexcept { return _names.size(); }
Memory inspect(const Symbol &symbol) const {
if (symbol.getValue() > _names.size()) {
return Memory();
}
- SymbolVector::Reference r(_names[symbol.getValue()]);
- return Memory(r.c_str(), r.size());
+ return _names[symbol.getValue()];
}
Symbol insert(const Memory &name);
Symbol lookup(const Memory &name) const;
diff --git a/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h b/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h
index dd41ed244ae..d3cdc174339 100644
--- a/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h
+++ b/vespalib/src/vespa/vespalib/datastore/compaction_strategy.h
@@ -2,8 +2,9 @@
#pragma once
-#include <iosfwd>
+#include <cstddef>
#include <cstdint>
+#include <iosfwd>
namespace vespalib {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
index 5e8d29980cd..8ccef84d969 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(vespalib_vespalib_fuzzy OBJECT
implicit_levenshtein_dfa.cpp
levenshtein_dfa.cpp
levenshtein_distance.cpp
+ table_dfa.cpp
unicode_utils.cpp
DEPENDS
)
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp
index 826b0beffd6..d87afef1fbe 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp
@@ -9,6 +9,7 @@ namespace {
const vespalib::string brute_force = "brute_force";
const vespalib::string dfa_implicit = "dfa_implicit";
const vespalib::string dfa_explicit = "dfa_explicit";
+const vespalib::string dfa_table = "dfa_table";
}
@@ -22,6 +23,8 @@ to_string(FuzzyMatchingAlgorithm algo)
return dfa_implicit;
case FuzzyMatchingAlgorithm::DfaExplicit:
return dfa_explicit;
+ case FuzzyMatchingAlgorithm::DfaTable:
+ return dfa_table;
default:
return "";
}
@@ -37,6 +40,8 @@ fuzzy_matching_algorithm_from_string(const vespalib::string& algo,
return FuzzyMatchingAlgorithm::DfaImplicit;
} else if (algo == dfa_explicit) {
return FuzzyMatchingAlgorithm::DfaExplicit;
+ } else if (algo == dfa_table) {
+ return FuzzyMatchingAlgorithm::DfaTable;
}
return default_algo;
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h
index 83cb121fe5f..9af94e84b89 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h
@@ -13,7 +13,8 @@ namespace vespalib {
enum class FuzzyMatchingAlgorithm {
BruteForce,
DfaImplicit,
- DfaExplicit
+ DfaExplicit,
+ DfaTable
};
vespalib::string to_string(FuzzyMatchingAlgorithm algo);
diff --git a/vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp
new file mode 100644
index 00000000000..2475704ec06
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/inline_tfa.hpp
@@ -0,0 +1,91 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// start of auto-generated code for N = 1
+template <> struct InlineTfa<1> {
+ static constexpr Transition table[6][8] = {
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,2},{0,2},{0,3},{0,3},{1,1},{1,1},{1,1},{1,1}},
+ {{0,0},{0,0},{2,4},{2,4},{1,4},{1,4},{1,2},{1,2}},
+ {{0,0},{3,4},{2,4},{2,2},{1,4},{1,5},{1,2},{1,3}},
+ {{0,0},{0,0},{0,0},{0,0},{1,4},{1,4},{1,4},{1,4}},
+ {{0,0},{3,4},{0,0},{3,4},{1,4},{1,5},{1,4},{1,5}}
+ };
+ static constexpr uint8_t edits[6][3] = {
+ {2,2,2},
+ {0,1,2},
+ {1,1,2},
+ {1,1,1},
+ {1,2,2},
+ {1,2,1}
+ };
+};
+// end of auto-generated code for N = 1
+// start of auto-generated code for N = 2
+template <> struct InlineTfa<2> {
+ static constexpr Transition table[31][32] = {
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,2},{0,2},{0,2},{0,2},{0,3},{0,3},{0,3},{0,3},{0,4},{0,4},{0,4},{0,4},{0,4},{0,4},{0,4},{0,4},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
+ {{0,5},{0,5},{0,5},{0,5},{0,6},{0,6},{0,6},{0,6},{0,7},{0,7},{0,7},{0,7},{0,7},{0,7},{0,7},{0,7},{1,8},{1,8},{1,8},{1,8},{1,9},{1,9},{1,9},{1,9},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2}},
+ {{0,5},{0,5},{0,10},{0,10},{0,6},{0,6},{0,11},{0,11},{0,7},{0,7},{0,12},{0,12},{0,7},{0,7},{0,12},{0,12},{1,8},{1,8},{1,13},{1,13},{1,9},{1,9},{1,14},{1,14},{1,2},{1,2},{1,3},{1,3},{1,2},{1,2},{1,3},{1,3}},
+ {{0,6},{0,6},{0,11},{0,11},{0,15},{0,15},{0,15},{0,15},{0,7},{0,7},{0,12},{0,12},{0,16},{0,16},{0,16},{0,16},{1,9},{1,9},{1,14},{1,14},{1,17},{1,17},{1,17},{1,17},{1,2},{1,2},{1,3},{1,3},{1,4},{1,4},{1,4},{1,4}},
+ {{0,0},{0,0},{0,0},{0,0},{3,18},{3,18},{3,18},{3,18},{2,18},{2,18},{2,18},{2,18},{2,19},{2,19},{2,19},{2,19},{1,18},{1,18},{1,18},{1,18},{1,20},{1,20},{1,20},{1,20},{1,19},{1,19},{1,19},{1,19},{1,5},{1,5},{1,5},{1,5}},
+ {{0,0},{0,0},{4,18},{4,18},{3,18},{3,18},{3,19},{3,19},{2,18},{2,18},{2,20},{2,20},{2,19},{2,19},{2,5},{2,5},{1,18},{1,18},{1,21},{1,21},{1,20},{1,20},{1,22},{1,22},{1,19},{1,19},{1,23},{1,23},{1,5},{1,5},{1,6},{1,6}},
+ {{2,19},{2,19},{2,5},{2,5},{3,8},{3,8},{3,8},{3,8},{2,19},{2,19},{2,5},{2,5},{3,8},{3,8},{3,8},{3,8},{1,5},{1,5},{1,6},{1,6},{1,7},{1,7},{1,7},{1,7},{1,5},{1,5},{1,6},{1,6},{1,7},{1,7},{1,7},{1,7}},
+ {{0,19},{0,19},{0,19},{0,19},{0,19},{0,19},{0,19},{0,19},{0,5},{0,5},{0,5},{0,5},{0,5},{0,5},{0,5},{0,5},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8},{1,8}},
+ {{0,19},{0,19},{0,19},{0,19},{0,23},{0,23},{0,23},{0,23},{0,5},{0,5},{0,5},{0,5},{0,6},{0,6},{0,6},{0,6},{1,8},{1,8},{1,8},{1,8},{1,9},{1,9},{1,9},{1,9},{1,8},{1,8},{1,8},{1,8},{1,9},{1,9},{1,9},{1,9}},
+ {{0,0},{5,18},{0,0},{5,18},{3,18},{3,20},{3,18},{3,20},{2,18},{2,21},{2,18},{2,21},{2,19},{2,23},{2,19},{2,23},{1,18},{1,24},{1,18},{1,24},{1,20},{1,25},{1,20},{1,25},{1,19},{1,26},{1,19},{1,26},{1,5},{1,10},{1,5},{1,10}},
+ {{0,0},{5,18},{4,18},{4,19},{3,18},{3,20},{3,19},{3,5},{2,18},{2,21},{2,20},{2,22},{2,19},{2,23},{2,5},{2,6},{1,18},{1,24},{1,21},{1,27},{1,20},{1,25},{1,22},{1,28},{1,19},{1,26},{1,23},{1,29},{1,5},{1,10},{1,6},{1,11}},
+ {{2,19},{2,23},{2,5},{2,6},{3,8},{3,9},{3,8},{3,9},{2,19},{2,23},{2,5},{2,6},{3,8},{3,9},{3,8},{3,9},{1,5},{1,10},{1,6},{1,11},{1,7},{1,12},{1,7},{1,12},{1,5},{1,10},{1,6},{1,11},{1,7},{1,12},{1,7},{1,12}},
+ {{0,19},{0,19},{0,26},{0,26},{0,19},{0,19},{0,26},{0,26},{0,5},{0,5},{0,10},{0,10},{0,5},{0,5},{0,10},{0,10},{1,8},{1,8},{1,13},{1,13},{1,8},{1,8},{1,13},{1,13},{1,8},{1,8},{1,13},{1,13},{1,8},{1,8},{1,13},{1,13}},
+ {{0,19},{0,19},{0,26},{0,26},{0,23},{0,23},{0,29},{0,29},{0,5},{0,5},{0,10},{0,10},{0,6},{0,6},{0,11},{0,11},{1,8},{1,8},{1,13},{1,13},{1,9},{1,9},{1,14},{1,14},{1,8},{1,8},{1,13},{1,13},{1,9},{1,9},{1,14},{1,14}},
+ {{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{2,5},{2,6},{2,7},{2,7},{2,5},{2,6},{2,7},{2,7},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30},{1,6},{1,11},{1,15},{1,15},{1,6},{1,11},{1,15},{1,15}},
+ {{2,5},{2,6},{2,7},{2,7},{3,8},{3,9},{3,2},{3,2},{2,5},{2,6},{2,7},{2,7},{3,8},{3,9},{3,2},{3,2},{1,6},{1,11},{1,15},{1,15},{1,7},{1,12},{1,16},{1,16},{1,6},{1,11},{1,15},{1,15},{1,7},{1,12},{1,16},{1,16}},
+ {{0,6},{0,6},{0,11},{0,11},{0,15},{0,15},{0,15},{0,15},{0,6},{0,6},{0,11},{0,11},{0,15},{0,15},{0,15},{0,15},{1,9},{1,9},{1,14},{1,14},{1,17},{1,17},{1,17},{1,17},{1,9},{1,9},{1,14},{1,14},{1,17},{1,17},{1,17},{1,17}},
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18}},
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,18},{2,18},{2,18},{2,18},{2,18},{2,18},{2,18},{2,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,18},{1,19},{1,19},{1,19},{1,19},{1,19},{1,19},{1,19},{1,19}},
+ {{0,0},{0,0},{0,0},{0,0},{3,18},{3,18},{3,18},{3,18},{0,0},{0,0},{0,0},{0,0},{3,18},{3,18},{3,18},{3,18},{1,18},{1,18},{1,18},{1,18},{1,20},{1,20},{1,20},{1,20},{1,18},{1,18},{1,18},{1,18},{1,20},{1,20},{1,20},{1,20}},
+ {{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21}},
+ {{0,0},{0,0},{4,18},{4,18},{3,18},{3,18},{3,19},{3,19},{0,0},{0,0},{4,18},{4,18},{3,18},{3,18},{3,19},{3,19},{1,18},{1,18},{1,21},{1,21},{1,20},{1,20},{1,22},{1,22},{1,18},{1,18},{1,21},{1,21},{1,20},{1,20},{1,22},{1,22}},
+ {{0,0},{0,0},{4,18},{4,18},{0,0},{0,0},{4,18},{4,18},{2,18},{2,18},{2,20},{2,20},{2,18},{2,18},{2,20},{2,20},{1,18},{1,18},{1,21},{1,21},{1,18},{1,18},{1,21},{1,21},{1,19},{1,19},{1,23},{1,23},{1,19},{1,19},{1,23},{1,23}},
+ {{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24}},
+ {{0,0},{5,18},{0,0},{5,18},{3,18},{3,20},{3,18},{3,20},{0,0},{5,18},{0,0},{5,18},{3,18},{3,20},{3,18},{3,20},{1,18},{1,24},{1,18},{1,24},{1,20},{1,25},{1,20},{1,25},{1,18},{1,24},{1,18},{1,24},{1,20},{1,25},{1,20},{1,25}},
+ {{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{0,0},{5,18},{2,18},{2,21},{2,18},{2,21},{2,18},{2,21},{2,18},{2,21},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,18},{1,24},{1,19},{1,26},{1,19},{1,26},{1,19},{1,26},{1,19},{1,26}},
+ {{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27}},
+ {{0,0},{5,18},{4,18},{4,19},{3,18},{3,20},{3,19},{3,5},{0,0},{5,18},{4,18},{4,19},{3,18},{3,20},{3,19},{3,5},{1,18},{1,24},{1,21},{1,27},{1,20},{1,25},{1,22},{1,28},{1,18},{1,24},{1,21},{1,27},{1,20},{1,25},{1,22},{1,28}},
+ {{0,0},{5,18},{4,18},{4,19},{0,0},{5,18},{4,18},{4,19},{2,18},{2,21},{2,20},{2,22},{2,18},{2,21},{2,20},{2,22},{1,18},{1,24},{1,21},{1,27},{1,18},{1,24},{1,21},{1,27},{1,19},{1,26},{1,23},{1,29},{1,19},{1,26},{1,23},{1,29}},
+ {{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{3,19},{3,5},{4,8},{4,8},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30},{1,22},{1,28},{1,30},{1,30}}
+ };
+ static constexpr uint8_t edits[31][5] = {
+ {3,3,3,3,3},
+ {0,1,2,3,3},
+ {1,1,2,3,3},
+ {1,1,2,2,3},
+ {1,1,1,2,3},
+ {2,2,2,3,3},
+ {2,2,2,2,3},
+ {2,2,1,2,3},
+ {1,2,3,3,3},
+ {1,2,2,3,3},
+ {2,2,2,3,2},
+ {2,2,2,2,2},
+ {2,2,1,2,2},
+ {1,2,3,2,3},
+ {1,2,2,2,3},
+ {2,2,2,1,2},
+ {2,2,1,1,2},
+ {1,2,1,2,3},
+ {2,3,3,3,3},
+ {2,2,3,3,3},
+ {2,3,2,3,3},
+ {2,3,3,2,3},
+ {2,3,2,2,3},
+ {2,2,3,2,3},
+ {2,3,3,3,2},
+ {2,3,2,3,2},
+ {2,2,3,3,2},
+ {2,3,3,2,2},
+ {2,3,2,2,2},
+ {2,2,3,2,2},
+ {2,3,2,1,2}
+ };
+};
+// end of auto-generated code for N = 2
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 1caae408176..5f6d0ae9956 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "explicit_levenshtein_dfa.h"
#include "implicit_levenshtein_dfa.h"
+#include "table_dfa.h"
#include "levenshtein_dfa.h"
#include "unicode_utils.h"
#include <vespa/vespalib/util/stringfmt.h>
@@ -53,14 +54,19 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max
} else { // max_edits == 2
return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
}
- } else { // DfaType::Explicit
+ } else if(dfa_type == DfaType::Explicit) {
if (max_edits == 1) {
return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
} else { // max_edits == 2
return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
}
+ } else { // DfaType::Table
+ if (max_edits == 1) {
+ return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased));
+ } else { // max_edits == 2
+ return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased));
+ }
}
-
}
LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
@@ -87,9 +93,11 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo
std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) {
if (dt == LevenshteinDfa::DfaType::Implicit) {
os << "Implicit";
- } else {
- assert(dt == LevenshteinDfa::DfaType::Explicit);
+ } else if (dt == LevenshteinDfa::DfaType::Explicit) {
os << "Explicit";
+ } else {
+ assert(dt == LevenshteinDfa::DfaType::Table);
+ os << "Table";
}
return os;
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index c6ca06d4de3..1652631e968 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -58,8 +58,8 @@ namespace vespalib::fuzzy {
* ====== Unicode support ======
*
* Matching and successor generation is fully Unicode-aware. All input strings are expected
- * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats;
- * see the documentation for match()).
+ * to be in UTF-8, and the generated successor is encoded as UTF-8 (with some caveats; see
+ * the documentation for match()) or UTF-32, depending on the chosen `match()` overload.
*
* Internally, matching is done on UTF-32 code points and the DFA itself is built around
* UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA.
@@ -159,7 +159,7 @@ public:
/**
* Attempts to match the source string `source` with the target string this DFA was
- * built with, emitting a successor string on mismatch if `successor_out` != nullptr.
+ * built with.
*
* `source` must not contain any null UTF-8 chars.
*
@@ -179,13 +179,23 @@ public:
* Attempts to match the source string `source` with the target string this DFA was
* built with, emitting a successor string into `successor_out` on mismatch.
*
- * See `match(source)` for semantics of returned MatchResult.
+ * In the case of a _match_, the following holds:
+ *
+ * - The returned MatchResult has the same semantics as `match(source)`.
+ * - `successor_out` has a _prefix_ equal to its value that was originally passed
+ * in at the time of match() being called. The _suffix_ of the string is unspecified,
+ * i.e. it may or may not have been modified.
*
* In the case of a _mismatch_, the following holds:
*
- * - `successor_out` is modified to contain the next (in byte-wise ordering) possible
- * _matching_ string S so that there exists no other matching string S' that is
- * greater than `source` but smaller than S.
+ * - `successor_out` has a _prefix_ equal to its value that was originally passed
+ * in at the time of match() being called.
+ * - `successor_out` has a _suffix_ that contains the next (in byte-wise ordering)
+ * possible _matching_ string S so that there exists no other matching string S'
+ * that is greater than `source` but smaller than S.
+ * The caller must explicitly be aware of any prefixes it sends in, as it is
+ * entirely ignored for the purposes of ordering the successor string vis-a-vis
+ * the input source string.
* - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally
* encode in bitwise form, but the _code points_ they encode may not be valid.
* In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of
@@ -203,12 +213,8 @@ public:
* is what is passed to the DFA match() function.
*
* Memory allocation:
- * This function does not directly or indirectly allocate any heap memory if either:
- *
- * - the input string is within the max edit distance, or
- * - `successor_out` is nullptr, or
- * - `successor_out` has sufficient capacity to hold the generated successor
- *
+ * This function does not directly or indirectly allocate any heap memory if the
+ * `successor_out` string provided is large enough to fit any generated successor.
* By reusing the successor string across many calls, this therefore amortizes memory
* allocations down to near zero per invocation.
*/
@@ -220,7 +226,8 @@ public:
* internally, and is therefore expected to be more efficient.
*
* The code point ordering of the UTF-32 successor string is identical to that its UTF-8
- * equivalent.
+ * equivalent. This includes the special cases where the successor may contain code points
+ * outside the legal Unicode range.
*/
[[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const;
@@ -231,7 +238,8 @@ public:
enum class DfaType {
Implicit,
- Explicit
+ Explicit,
+ Table
};
/**
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index fb5ec32abc7..9f5dc0fb0c0 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -136,15 +136,14 @@ struct MatchAlgorithm {
* that has nothing in common with the source altogether.
* Example: "gp" -> "hfood" (+1 char value case)
*
- * Performance note:
- * Both the input and successor output strings are in UTF-8 format. To avoid doing
- * duplicate work, we keep track of the byte length of the string prefix that will be
- * part of the successor and simply copy it verbatim instead of building the string
- * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used
- * when one or more of the prefix characters have been lowercase-transformed.
+ * Note for cased vs. uncased matching: when uncased matching is specified, we always
+ * match "as if" both the target and source strings are lowercased. This means that
+ * successor strings are generated based on this form, _not_ on the original form.
+ * Example: uncased matching for target "food" with input "FOXX". This generates the
+ * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
+ * ordering when compared byte-wise against an implicitly lowercased dictionary.
*
* TODO let matcher know if source string is pre-normalized (i.e. lowercased).
- * TODO consider opportunistically appending prefix as we go instead of only when needed.
*/
template <DfaMatcher Matcher, typename SuccessorT>
static MatchResult match(const Matcher& matcher,
@@ -153,22 +152,19 @@ struct MatchAlgorithm {
{
using StateType = typename Matcher::StateType;
Utf8Reader u8_reader(source.data(), source.size());
- uint32_t n_prefix_u8_bytes = 0;
+ uint32_t n_prefix_chars = static_cast<uint32_t>(successor_out.size()); // Don't touch any existing prefix
uint32_t char_after_prefix = 0;
StateType last_state_with_higher_out = StateType{};
- bool can_use_raw_prefix = true;
StateType state = matcher.start();
while (u8_reader.hasMore()) {
- const auto u8_pos_before_char = u8_reader.getPos();
- const uint32_t raw_mch = u8_reader.getChar();
- const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased());
- if (raw_mch != mch) {
- can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix
- }
+ const auto pos_before_char = static_cast<uint32_t>(successor_out.size());
+ const uint32_t raw_mch = u8_reader.getChar();
+ const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased());
+ append_utf32_char(successor_out, mch);
if (matcher.has_higher_out_edge(state, mch)) {
last_state_with_higher_out = state;
- n_prefix_u8_bytes = u8_pos_before_char;
+ n_prefix_chars = pos_before_char;
char_after_prefix = mch;
}
auto maybe_next = matcher.match_input(state, mch);
@@ -176,8 +172,7 @@ struct MatchAlgorithm {
state = maybe_next;
} else {
// Can never match; find the successor
- emit_successor_prefix(successor_out, source, n_prefix_u8_bytes,
- matcher.is_cased() || can_use_raw_prefix);
+ successor_out.resize(n_prefix_chars); // Always <= successor_out.size()
assert(matcher.valid_state(last_state_with_higher_out));
backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
char_after_prefix, successor_out);
@@ -188,8 +183,7 @@ struct MatchAlgorithm {
if (edits <= max_edits()) {
return MatchResult::make_match(max_edits(), edits);
}
- emit_successor_prefix(successor_out, source, source.size(),
- matcher.is_cased() || can_use_raw_prefix);
+ // Successor prefix already filled, just need to emit the suffix
emit_smallest_matching_suffix(matcher, state, successor_out);
return MatchResult::make_mismatch(max_edits());
}
@@ -320,48 +314,6 @@ struct MatchAlgorithm {
}
}
- template <typename T>
- static constexpr bool has_8bit_value_type() noexcept {
- return sizeof(typename T::value_type) == 1;
- }
-
- /**
- * The successor prefix is the prefix of the source string up to (but not including) the
- * point where we emit a lexicographically higher character. Ideally we can just copy the
- * UTF-8 bytes verbatim from the source into the successor. This is possible when one of
- * the following holds:
- *
- * - DFA uses Cased (i.e. exact) matching, or
- * - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase
- * transform. This means the prefix is already as-if lowercased, and we can copy it
- * verbatim.
- *
- * In the case that we can't copy verbatim, we currently have to explicitly normalize the
- * prefix by converting it to its lowercased form.
- *
- * Example: Uncased matching for target "food" with input "FOXX". This generates the
- * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
- * ordering when compared byte-wise against an implicitly lowercased dictionary.
- */
- template <typename SuccessorT>
- static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source,
- uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes)
- {
- // TODO redesign prefix output wiring
- if constexpr (has_8bit_value_type<SuccessorT>()) {
- if (emit_raw_prefix_u8_bytes) {
- successor_out = source.substr(0, n_prefix_u8_bytes);
- return;
- }
- }
- // TODO avoid duplicate work...! :I
- successor_out.clear();
- Utf8Reader u8_reader(source.data(), source.size());
- while (u8_reader.getPos() < n_prefix_u8_bytes) {
- append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar()));
- }
- }
-
static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept {
return (is_cased ? in_ch : LowerCase::convert(in_ch));
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
index d20cfc07a9a..dfec0bac4a8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
@@ -112,11 +112,11 @@ std::ostream& operator<<(std::ostream& os, const FixedSparseState<MaxEdits>& s)
if (i != 0) {
os << ", ";
}
- for (size_t j = last_idx; j < s.indices[i]; ++j) {
+ for (size_t j = last_idx; j < s.index(i); ++j) {
os << "-, ";
}
- last_idx = s.indices[i] + 1;
- os << static_cast<uint32_t>(s.costs[i]);
+ last_idx = s.index(i) + 1;
+ os << static_cast<uint32_t>(s.cost(i));
}
os << "]";
return os;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp
new file mode 100644
index 00000000000..943349818fb
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.cpp
@@ -0,0 +1,10 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "table_dfa.hpp"
+
+namespace vespalib::fuzzy {
+
+template class TableDfa<1>;
+template class TableDfa<2>;
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
new file mode 100644
index 00000000000..45c52c84bad
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h
@@ -0,0 +1,63 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "levenshtein_dfa.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+/**
+ * This implementation is based on the paper 'Fast string correction
+ * with Levenshtein automata' from 2002 by Klaus U. Schulz and Stoyan
+ * Mihov.
+ *
+ * Given the maximal distance N, a generic parameterized transition
+ * table is calculated up-front. When a specific word is given, a
+ * simple lookup structure is created to enumerate the possible
+ * characteristic vectors for each position in the given
+ * word. Together, these structures can be used to simulate the
+ * traversal of a hypothetical Levenshtein dfa that will never be
+ * created.
+ *
+ * Approaching the end of the word is handled by padding the
+ * characteristic vectors with 0 bits for everything after the word
+ * ends. In addition, a unit test verifies that there is no possible
+ * sequence of events that leads to the minimal boundary of the state
+ * exceeding the boundary of the word itself. This means that the
+ * simulated dfa can be stepped freely without checking for word size.
+ **/
+template <uint8_t N>
+class TableDfa final : public LevenshteinDfa::Impl
+{
+public:
+ // characteristic vector for a specific input value indicating how
+ // it matches the window starting at the minimal boundary.
+ struct CV {
+ uint32_t input;
+ uint32_t match;
+ CV() noexcept : input(0), match(0) {}
+ };
+ static constexpr size_t window_size() { return 2 * N + 1; }
+ struct Lookup {
+ std::array<CV, window_size()> list;
+ Lookup() noexcept : list() {}
+ };
+
+private:
+ const std::vector<Lookup> _lookup;
+ const bool _is_cased;
+
+ static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str);
+
+public:
+ using MatchResult = LevenshteinDfa::MatchResult;
+ TableDfa(std::vector<uint32_t> str, bool is_cased);
+ ~TableDfa() override;
+ [[nodiscard]] MatchResult match(std::string_view source) const override;
+ [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const override;
+ [[nodiscard]] size_t memory_usage() const noexcept override;
+ void dump_as_graphviz(std::ostream& os) const override;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
new file mode 100644
index 00000000000..de850681113
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
@@ -0,0 +1,586 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "table_dfa.h"
+#include "match_algorithm.hpp"
+#include <vespa/vespalib/util/stringfmt.h>
+#include <cassert>
+#include <stdexcept>
+#include <algorithm>
+#include <map>
+#include <ostream>
+#include <set>
+#include <queue>
+
+namespace vespalib::fuzzy {
+
+namespace {
+
+using vespalib::make_string_short::fmt;
+
+// It is useful to know the number of states compile time to be able
+// to pack lookup tables better.
+template <uint8_t N> constexpr size_t num_states();
+template <> constexpr size_t num_states<1>() { return 6; }
+template <> constexpr size_t num_states<2>() { return 31; }
+template <> constexpr size_t num_states<3>() { return 197; }
+
+template <uint8_t N> constexpr size_t window_size() { return 2 * N + 1; }
+template <uint8_t N> constexpr size_t num_transitions() { return 1 << window_size<N>(); }
+
+
+auto diff(auto a, auto b) { return (a > b) ? (a - b) : (b - a); }
+
+// A Position combines an index into a word being matched with the
+// number of edits needed to get there. This maps directly onto a
+// specific state in the NFA used to match a word. Note that the sort
+// order prefers low edits over low indexs. This is to ensure that a
+// position that subsumes another position will always sort before it.
+struct Position {
+ uint32_t index;
+ uint32_t edits;
+ Position(uint32_t index_in, uint32_t edits_in) noexcept
+ : index(index_in), edits(edits_in) {}
+ static Position start() noexcept { return Position(0,0); }
+ bool subsumes(const Position &rhs) const noexcept {
+ if (edits >= rhs.edits) {
+ return false;
+ }
+ return diff(index, rhs.index) <= (rhs.edits - edits);
+ }
+ Position materialize(uint32_t target_index) const noexcept {
+ return Position(target_index, edits + diff(index, target_index));
+ }
+ bool operator==(const Position &rhs) const noexcept {
+ return (index == rhs.index) && (edits == rhs.edits);
+ }
+ bool operator<(const Position &rhs) const noexcept {
+ return std::tie(edits,index) < std::tie(rhs.edits, rhs.index);
+ }
+ template <uint8_t N>
+ void add_elementary_transitions(const std::vector<bool> &bits, std::vector<Position> &dst) const {
+ assert(bits.size() > index);
+ if (!bits[index]) {
+ dst.emplace_back(index, edits + 1);
+ dst.emplace_back(index + 1, edits + 1);
+ }
+ for (uint32_t e = 0; (edits + e) <= N; ++e) {
+ assert(bits.size() > (index + e));
+ if (bits[index + e]) {
+ dst.emplace_back(index + e + 1, edits + e);
+ }
+ }
+ }
+ vespalib::string to_string() const { return fmt("%u#%u", index, edits); }
+};
+
+// A State is a collection of different Positions that do not subsume
+// each other. If the minimal boundary of a state is larger than 0, it
+// can be lifted from the state in a normalizing operation that will
+// renumber the position indexes such that the minimal boundary of the
+// state becomes 0. This is to allow parameterized states where the
+// general progress of matching the string (minimal boundary of
+// non-normalized state) is untangled from the local competing edit
+// alternatives (normalized state).
+struct State {
+ std::vector<Position> list;
+ State() noexcept : list() {}
+ static State failed() noexcept { return State(); }
+ static State start() {
+ State result;
+ result.list.push_back(Position::start());
+ return result;
+ }
+ bool operator<(const State &rhs) const {
+ return list < rhs.list;
+ }
+ uint32_t minimal_boundary() const noexcept {
+ if (list.empty()) {
+ return 0;
+ }
+ uint32_t min = list[0].index;
+ for (size_t i = 1; i < list.size(); ++i) {
+ min = std::min(min, list[i].index);
+ }
+ return min;
+ }
+ uint32_t normalize() {
+ uint32_t min = minimal_boundary();
+ if (min > 0) {
+ for (auto &entry: list) {
+ entry.index -= min;
+ }
+ }
+ return min;
+ }
+ template <uint8_t N>
+ static State create(std::vector<Position> list_in) {
+ State result;
+ auto want = [&result](Position pos) {
+ if (pos.edits > N) {
+ return false;
+ }
+ for (const auto &old_pos: result.list) {
+ if (old_pos == pos || old_pos.subsumes(pos)) {
+ return false;
+ }
+ }
+ return true;
+ };
+ std::sort(list_in.begin(), list_in.end());
+ for (const auto &pos: list_in) {
+ if (want(pos)) {
+ result.list.push_back(pos);
+ }
+ }
+ return result;
+ }
+ template <uint8_t N>
+ State next(const std::vector<bool> &bits) const {
+ std::vector<Position> tmp;
+ for (const auto &pos: list) {
+ pos.add_elementary_transitions<N>(bits, tmp);
+ }
+ return create<N>(std::move(tmp));
+ }
+ template <uint8_t N>
+ std::vector<uint8_t> make_edit_vector() const {
+ std::vector<uint8_t> result(window_size<N>(), N + 1);
+ for (const auto &pos: list) {
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ result[i] = std::min(result[i], uint8_t(pos.materialize(i).edits));
+ }
+ }
+ return result;
+ }
+ vespalib::string to_string() const {
+ vespalib::string result = "{";
+ for (size_t i = 0; i < list.size(); ++i) {
+ if (i > 0) {
+ result.append(",");
+ }
+ result.append(list[i].to_string());
+ }
+ result.append("}");
+ return result;
+ }
+};
+
+// Keeps track of unique states, assigning an integer value to each
+// state. Only states with minimal boundary 0 is allowed to be
+// inserted into a state repo. Each repo is seeded with the empty
+// state (0) and the start state (1). An assigned integer value can be
+// mapped back into the originating state.
+struct StateRepo {
+ using Map = std::map<State,uint32_t>;
+ using Ref = Map::iterator;
+ Map seen;
+ std::vector<Ref> refs;
+ StateRepo() noexcept : seen(), refs() {
+ auto failed_idx = state_to_idx(State::failed());
+ auto start_idx = state_to_idx(State::start());
+ assert(failed_idx == 0);
+ assert(start_idx == 1);
+ }
+ ~StateRepo();
+ size_t size() const { return seen.size(); }
+ uint32_t state_to_idx(const State &state) {
+ assert(state.minimal_boundary() == 0);
+ uint32_t next = refs.size();
+ auto [pos, inserted] = seen.emplace(state, next);
+ if (inserted) {
+ refs.push_back(pos);
+ }
+ assert(seen.size() == refs.size());
+ return pos->second;
+ }
+ const State &idx_to_state(uint32_t idx) const {
+ assert(idx < refs.size());
+ return refs[idx]->first;
+ }
+};
+[[maybe_unused]] StateRepo::~StateRepo() = default;
+
+template <uint8_t N>
+std::vector<bool> expand_bits(uint32_t value) {
+ static_assert(N < 10);
+ std::vector<bool> result(window_size<N>());
+ uint32_t look_for = num_transitions<N>();
+ assert(value < look_for);
+ for (size_t i = 0; i < result.size(); ++i) {
+ look_for >>= 1;
+ result[i] = (value & look_for);
+ }
+ return result;
+}
+
+template <uint8_t N>
+StateRepo make_state_repo() {
+ StateRepo repo;
+ for (uint32_t idx = 0; idx < repo.size(); ++idx) {
+ const State &state = repo.idx_to_state(idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ State new_state = state.next<N>(expand_bits<N>(i));
+ (void) new_state.normalize();
+ (void) repo.state_to_idx(new_state);
+ }
+ }
+ return repo;
+}
+
+struct Transition {
+ uint8_t step;
+ uint8_t state;
+ constexpr Transition() noexcept : step(0), state(0) {}
+ constexpr Transition(uint8_t di, uint8_t ns) noexcept : step(di), state(ns) {}
+};
+
+template <uint8_t N> struct InlineTfa;
+#include "inline_tfa.hpp"
+
+template <uint8_t N>
+struct Tfa {
+ // what happens when following a transition from a state?
+ std::array<std::array<Transition,num_transitions<N>()>,num_states<N>()> table;
+
+ // how many edits did we use to match the target word?
+ std::array<std::array<uint8_t,window_size<N>()>,num_states<N>()> edits;
+};
+
+template <uint8_t N>
+std::unique_ptr<Tfa<N>> make_tfa() {
+ auto tfa = std::make_unique<Tfa<N>>();
+ StateRepo repo;
+ uint32_t state_idx = 0;
+ for (; state_idx < repo.size(); ++state_idx) {
+ const State &state = repo.idx_to_state(state_idx);
+ for (uint32_t i = 0; i < num_transitions<N>(); ++i) {
+ State new_state = state.next<N>(expand_bits<N>(i));
+ uint32_t step = new_state.normalize();
+ uint32_t new_state_idx = repo.state_to_idx(new_state);
+ assert(step < 256);
+ assert(new_state_idx < 256);
+ tfa->table[state_idx][i].step = step;
+ tfa->table[state_idx][i].state = new_state_idx;
+ }
+ auto edits = state.make_edit_vector<N>();
+ assert(edits.size() == window_size<N>());
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ tfa->edits[state_idx][i] = edits[i];
+ }
+ }
+ assert(repo.size() == num_states<N>());
+ assert(state_idx == num_states<N>());
+ return tfa;
+}
+
+template <typename T>
+vespalib::string format_vector(const std::vector<T> &vector, bool compact = false) {
+ vespalib::string str = compact ? "" : "[";
+ for (size_t i = 0; i < vector.size(); ++i) {
+ if (i > 0 && !compact) {
+ str.append(",");
+ }
+ str.append(fmt("%u", uint32_t(vector[i])));
+ }
+ if (!compact) {
+ str.append("]");
+ }
+ return str;
+}
+
+// A table-based state using the InlineTfa tables to simulate stepping
+// a dfa with max edit distance N. The state itself is represented by
+// a number used as offset into these tables (state). Since the state
+// is parametric, we also store the minimal boundary of the state
+// separately (index).
+template <uint8_t N>
+struct TfaState {
+ uint32_t index;
+ uint32_t state;
+ // needed by dfa matcher concept (should use std::declval instead)
+ constexpr TfaState() noexcept : index(0), state(0) {}
+ constexpr TfaState(uint32_t i, uint32_t s) noexcept : index(i), state(s) {}
+ static constexpr TfaState start() { return TfaState(0, 1); }
+ constexpr bool valid() const noexcept { return state != 0; }
+ constexpr TfaState next(uint32_t bits) const noexcept {
+ const auto &entry = InlineTfa<N>::table[state][bits];
+ return TfaState(index + entry.step, entry.state);
+ }
+ constexpr bool is_valid_edge(uint32_t bits) const noexcept {
+ return InlineTfa<N>::table[state][bits].state != 0;
+ }
+ constexpr uint8_t edits(uint32_t end) const noexcept {
+ uint32_t leap = (end - index);
+ return (leap < window_size<N>()) ? InlineTfa<N>::edits[state][leap] : N + 1;
+ }
+ // for pretty graphviz dumping; minimal possible edits given perfect input from here on
+ constexpr uint32_t min_edits() const noexcept {
+ uint8_t res = N + 1;
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ res = std::min(res, InlineTfa<N>::edits[state][i]);
+ }
+ return res;
+ }
+ // for pretty graphviz dumping; actual edits needed to reach the word end from a valid state
+ constexpr uint32_t exact_edits(uint32_t end) const noexcept {
+ assert(valid());
+ uint32_t res = end;
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ if (uint32_t e = InlineTfa<N>::edits[state][i]; e <= N) {
+ res = std::min(res, e + diff(index + i, end));
+ }
+ }
+ return res;
+ }
+ // for pretty graphviz dumping; enable using in map and set
+ bool operator<(const TfaState &rhs) const noexcept {
+ return std::tie(index, state) < std::tie(rhs.index, rhs.state);
+ }
+ // for pretty graphviz dumping; check for redundant edges
+ bool operator==(const TfaState &rhs) const noexcept {
+ return std::tie(index, state) == std::tie(rhs.index, rhs.state);
+ }
+};
+
+template <uint8_t N>
+struct TableMatcher {
+ using S = TfaState<N>;
+ using StateType = TfaState<N>;
+ using StateParamType = TfaState<N>;
+ using EdgeType = uint32_t;
+
+ const TableDfa<N>::Lookup *lookup;
+ const uint32_t end;
+ const bool cased;
+
+ TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in)
+ noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {}
+
+ bool is_cased() const noexcept { return cased; }
+ static constexpr S start() noexcept { return S::start(); }
+
+ uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); }
+ bool is_match(S s) const noexcept { return s.edits(end) <= N; }
+
+ static constexpr bool can_match(S s) noexcept { return s.valid(); }
+ static constexpr bool valid_state(S) noexcept { return true; }
+
+ S match_wildcard(S s) const noexcept { return s.next(0); }
+ S match_input(S s, uint32_t c) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ for (size_t i = 0; i < window_size<N>() && slice[i].input != 0; ++i) {
+ if (slice[i].input == c) {
+ return s.next(slice[i].match);
+ }
+ }
+ return match_wildcard(s);
+ }
+
+ bool has_higher_out_edge(S s, uint32_t c) const noexcept {
+ if (s.is_valid_edge(0)) {
+ return true;
+ }
+ const auto *slice = lookup[s.index].list.data();
+ for (size_t i = 0; i < window_size<N>() && slice[i].input > c; ++i) {
+ if (s.is_valid_edge(slice[i].match)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool has_exact_explicit_out_edge(S s, uint32_t c) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ for (size_t i = 0; i < window_size<N>() && slice[i].input >= c; ++i) {
+ if (slice[i].input == c) {
+ return s.is_valid_edge(slice[i].match);
+ }
+ }
+ return false;
+ }
+
+ uint32_t lowest_higher_explicit_out_edge(S s, uint32_t c) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ size_t i = window_size<N>();
+ while (i-- > 0) {
+ if (slice[i].input > c && s.is_valid_edge(slice[i].match)) {
+ return slice[i].input;
+ }
+ }
+ return 0;
+ }
+
+ uint32_t smallest_explicit_out_edge(S s) const noexcept {
+ const auto *slice = lookup[s.index].list.data();
+ size_t i = window_size<N>();
+ while (i-- > 0) {
+ if (slice[i].input != 0 && s.is_valid_edge(slice[i].match)) {
+ return slice[i].input;
+ }
+ }
+ return 0;
+ }
+
+ static constexpr bool valid_edge(uint32_t) noexcept { return true; }
+ static constexpr uint32_t edge_to_u32char(uint32_t c) noexcept { return c; }
+ S edge_to_state(S s, uint32_t c) const noexcept { return match_input(s, c); }
+
+ static constexpr bool implies_exact_match_suffix(S) noexcept { return false; }
+ static constexpr void emit_exact_match_suffix(S, std::vector<uint32_t> &) {} // not called
+ static constexpr void emit_exact_match_suffix(S, std::string &) {} // not called
+};
+
+} // unnamed
+
+template <uint8_t N>
+auto
+TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup>
+{
+ std::vector<Lookup> result(str.size() + 1);
+ auto have_already = [&](uint32_t c, size_t i)noexcept{
+ for (size_t j = 0; j < window_size(); ++j) {
+ if (result[i].list[j].input == c) {
+ return true;
+ }
+ }
+ return false;
+ };
+ auto make_vector = [&](uint32_t c, size_t i)noexcept{
+ uint32_t bits = 0;
+ for (size_t j = 0; j < window_size(); ++j) {
+ bool found = ((i + j) < str.size()) && (str[i+j] == c);
+ bits = (bits << 1) + found;
+ }
+ return bits;
+ };
+ for (size_t i = 0; i < str.size(); ++i) {
+ for (size_t j = 0; j < window_size(); ++j) {
+ assert(result[i].list[j].input == 0);
+ assert(result[i].list[j].match == 0);
+ if ((i + j) < str.size()) {
+ uint32_t c = str[i + j];
+ if (!have_already(c, i)) {
+ result[i].list[j].input = c;
+ result[i].list[j].match = make_vector(c, i);
+ }
+ }
+ }
+ std::sort(result[i].list.begin(), result[i].list.end(),
+ [](const auto &a, const auto &b){ return a.input > b.input; });
+ }
+ return result;
+}
+
+template <uint8_t N>
+TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased)
+ : _lookup(make_lookup(str)),
+ _is_cased(is_cased)
+{
+}
+
+template <uint8_t N>
+TableDfa<N>::~TableDfa() = default;
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str) const
+{
+ TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+ return MatchAlgorithm<N>::match(matcher, u8str);
+}
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const
+{
+ TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+ return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t N>
+LevenshteinDfa::MatchResult
+TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const
+{
+ TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased);
+ return MatchAlgorithm<N>::match(matcher, u8str, successor_out);
+}
+
+template <uint8_t N>
+size_t
+TableDfa<N>::memory_usage() const noexcept
+{
+ return _lookup.size() * sizeof(Lookup);
+}
+
+template <uint8_t N>
+void
+TableDfa<N>::dump_as_graphviz(std::ostream &os) const
+{
+ using state_t = TfaState<N>;
+ auto id_of = [state_ids = std::map<state_t,uint32_t>()](state_t state) mutable {
+ return state_ids.emplace(state, state_ids.size()).first->second;
+ };
+ auto explore = [explored = std::set<state_t>()](state_t state) mutable {
+ return explored.insert(state).second;
+ };
+ auto edits = [end = (_lookup.size() - 1)](state_t state) noexcept -> uint32_t {
+ return state.exact_edits(end);
+ };
+ struct Edge {
+ uint32_t input;
+ state_t from;
+ state_t to;
+ operator bool() const { return to.valid(); }
+ };
+ auto best_edge = [&](const Edge &a, const Edge &b) {
+ // inverted due to priority queue
+ if (a.to.min_edits() == b.to.min_edits()) {
+ return edits(a.to) > edits(b.to);
+ }
+ return a.to.min_edits() > b.to.min_edits();
+ };
+ auto todo = std::priority_queue<Edge,std::vector<Edge>,decltype(best_edge)>(best_edge);
+ auto handle_state = [&](state_t state) {
+ if (explore(state)) {
+ // number states by following the best edges first
+ auto my_id = id_of(state);
+ if (edits(state) <= N) {
+ os << " " << my_id << " [label=\"" << my_id << "(" << edits(state) << ")\", style=\"filled\"];\n";
+ }
+ auto null_edge = Edge{0, state, state.next(0)};
+ if (null_edge) {
+ todo.push(null_edge);
+ }
+ for (auto [c, bits]: _lookup[state.index].list) {
+ if (auto edge = Edge{c, state, state.next(bits)}; edge && edge.to != null_edge.to) {
+ // only process valid out edges that are not covered by the null_edge
+ todo.push(edge);
+ }
+ }
+ }
+ };
+ auto handle_edge = [&](Edge edge) {
+ handle_state(edge.to);
+ if (edge.input != 0) {
+ std::string as_utf8;
+ append_utf32_char(as_utf8, edge.input);
+ os << " " << id_of(edge.from) << " -> " << id_of(edge.to) << " [label=\"" << as_utf8 << "\"];\n";
+ } else {
+ os << " " << id_of(edge.from) << " -> " << id_of(edge.to) << " [label=\"*\"];\n";
+ }
+ };
+ os << std::dec << "digraph table_dfa {\n";
+ os << " fontname=\"Helvetica,Arial,sans-serif\"\n";
+ os << " node [shape=circle, fontname=\"Helvetica,Arial,sans-serif\", fixedsize=true];\n";
+ os << " edge [fontname=\"Helvetica,Arial,sans-serif\"];\n";
+ handle_state(state_t::start());
+ while (!todo.empty()) {
+ auto next_edge = todo.top();
+ todo.pop();
+ handle_edge(next_edge);
+ }
+ os << "}\n";
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/stllike/hashtable.h b/vespalib/src/vespa/vespalib/stllike/hashtable.h
index e290d2f626c..fa88bb038b4 100644
--- a/vespalib/src/vespa/vespalib/stllike/hashtable.h
+++ b/vespalib/src/vespa/vespalib/stllike/hashtable.h
@@ -62,7 +62,7 @@ public:
class prime_modulator
{
public:
- prime_modulator(next_t sizeOfHashTable) noexcept : _modulo(sizeOfHashTable) { }
+ explicit prime_modulator(next_t sizeOfHashTable) noexcept : _modulo(sizeOfHashTable) { }
next_t modulo(next_t hash) const noexcept { return hash % _modulo; }
next_t getTableSize() const noexcept { return _modulo; }
static next_t selectHashTableSize(size_t sz) { return hashtable_base::getModuloStl(sz); }
@@ -76,7 +76,7 @@ public:
class and_modulator
{
public:
- and_modulator(next_t sizeOfHashTable) noexcept : _mask(sizeOfHashTable-1) { }
+ explicit and_modulator(next_t sizeOfHashTable) noexcept : _mask(sizeOfHashTable-1) { }
next_t modulo(next_t hash) const noexcept { return hash & _mask; }
next_t getTableSize() const noexcept { return _mask + 1; }
static next_t selectHashTableSize(size_t sz) noexcept { return hashtable_base::getModuloSimple(sz); }
@@ -198,7 +198,7 @@ public:
using pointer = Value*;
using iterator_category = std::forward_iterator_tag;
- constexpr iterator(hashtable * hash) noexcept : _current(0), _hashTable(hash) {
+ constexpr explicit iterator(hashtable * hash) noexcept : _current(0), _hashTable(hash) {
if (! _hashTable->_nodes[_current].valid()) {
advanceToNextValidHash();
}
@@ -242,7 +242,7 @@ public:
using pointer = const Value*;
using iterator_category = std::forward_iterator_tag;
- constexpr const_iterator(const hashtable * hash) noexcept : _current(0), _hashTable(hash) {
+ constexpr explicit const_iterator(const hashtable * hash) noexcept : _current(0), _hashTable(hash) {
if (! _hashTable->_nodes[_current].valid()) {
advanceToNextValidHash();
}
@@ -282,7 +282,7 @@ public:
hashtable & operator = (hashtable &&) noexcept = default;
hashtable(const hashtable &);
hashtable & operator = (const hashtable &);
- hashtable(size_t reservedSpace);
+ explicit hashtable(size_t reservedSpace);
hashtable(size_t reservedSpace, const Hash & hasher, const Equal & equal);
virtual ~hashtable();
constexpr iterator begin() noexcept { return iterator(this); }
diff --git a/vespalib/src/vespa/vespalib/stllike/hashtable.hpp b/vespalib/src/vespa/vespalib/stllike/hashtable.hpp
index 6d2d397a887..040e421f68c 100644
--- a/vespalib/src/vespa/vespalib/stllike/hashtable.hpp
+++ b/vespalib/src/vespa/vespalib/stllike/hashtable.hpp
@@ -53,8 +53,7 @@ hashtable<Key, Value, Hash, Equal, KeyExtract, Modulator>::hashtable(size_t rese
_nodes(createStore<NodeStore>(reservedSpace, _modulator.getTableSize())),
_hasher(hasher),
_equal(equal)
-{
-}
+{ }
template< typename Key, typename Value, typename Hash, typename Equal, typename KeyExtract, typename Modulator >
hashtable<Key, Value, Hash, Equal, KeyExtract, Modulator>::hashtable(const hashtable &) = default;
@@ -130,7 +129,7 @@ hashtable<Key, Value, Hash, Equal, KeyExtract, Modulator>::insert_internal(V &&
_count++;
return insert_result(iterator(this, h), true);
}
- return insert_internal_cold(std::move(node), h);
+ return insert_internal_cold(std::forward<V>(node), h);
}
template< typename Key, typename Value, typename Hash, typename Equal, typename KeyExtract, typename Modulator >
diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h
index 99e3f8cfe13..489b16b1ed4 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.h
+++ b/vespalib/src/vespa/vespalib/text/utf8.h
@@ -321,6 +321,7 @@ public:
return i;
}
+ const char* get_current_ptr() const noexcept { return _p; }
};
diff --git a/vespalib/src/vespa/vespalib/util/address_space.h b/vespalib/src/vespa/vespalib/util/address_space.h
index 948217bfd2b..fd172427720 100644
--- a/vespalib/src/vespa/vespalib/util/address_space.h
+++ b/vespalib/src/vespa/vespalib/util/address_space.h
@@ -2,6 +2,7 @@
#pragma once
+#include <cstddef>
#include <iosfwd>
namespace vespalib {