diff options
Diffstat (limited to 'searchlib/src/tests/attribute/dfa_fuzzy_matcher')
-rw-r--r-- | searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp | 133 |
1 files changed, 115 insertions, 18 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp index 77e23a58163..c2a39779061 100644 --- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -8,6 +8,7 @@ #include <vespa/vespalib/fuzzy/levenshtein_dfa.h> #include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/util/time.h> +#include <vespa/vespalib/text/utf8.h> #include <filesystem> #include <fstream> #include <iostream> @@ -28,12 +29,22 @@ using vespalib::FuzzyMatcher; using vespalib::datastore::AtomicEntryRef; using vespalib::datastore::EntryRef; using vespalib::fuzzy::LevenshteinDfa; +using vespalib::Utf8Reader; +using vespalib::Utf8Writer; using StringEnumStore = EnumStoreT<const char*>; using DictionaryEntry = std::pair<std::string, size_t>; using RawDictionary = std::vector<DictionaryEntry>; using StringVector = std::vector<std::string>; +namespace { + +const char* char_from_u8(const char8_t* p) { + return reinterpret_cast<const char*>(p); +} + +} + RawDictionary read_dictionary() { @@ -110,11 +121,11 @@ struct MatchStats { template <bool collect_matches> void -brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words) +brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - FuzzyMatcher matcher(target, 2, prefix_size, false); + FuzzyMatcher matcher(target, 2, prefix_size, cased); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -134,12 +145,18 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, prefix_size, false, LevenshteinDfa::DfaType::Explicit); - std::string target_copy(target.substr(0, prefix_size)); + DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit); + Utf8Reader reader(vespalib::stringref(target.data(), target.size())); + std::string target_copy; + Utf8Writer<std::string> writer(target_copy); + for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) { + auto code_point = reader.getChar(); + writer.putChar(code_point); + } auto prefix_cmp = store.make_folded_comparator_prefix(target_copy.c_str()); auto itr = prefix_size > 0 ? view.lowerBound(AtomicEntryRef(), prefix_cmp) : view.begin(); auto itr_end = itr; @@ -169,10 +186,58 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st stats.add_sample(matches, seeks, timer.elapsed()); } -struct DfaFuzzyMatcherTest : public ::testing::Test { +template <bool collect_matches> +void +dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +{ + auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); + vespalib::Timer timer; + DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit); + auto itr = view.begin(); + size_t matches = 0; + size_t seeks = 0; + for (;itr.valid(); ++itr) { + auto word = store.get_value(itr.getKey().load_relaxed()); + if (matcher.is_match(word)) { + ++matches; + if (collect_matches) { + matched_words.push_back(word); + } + } else { + ++seeks; + } + } + stats.add_sample(matches, seeks, timer.elapsed()); +} + +struct TestParam +{ + vespalib::string _name; + bool _cased; + + TestParam(vespalib::string name, bool cased) + : _name(std::move(name)), + _cased(cased) + { + } + TestParam(const TestParam&); + ~TestParam(); +}; + +TestParam::TestParam(const TestParam&) = default; + +TestParam::~TestParam() = default; + +std::ostream& operator<<(std::ostream& os, const TestParam& param) +{ + os << param._name; + return os; +} + +struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> { StringEnumStore store; DfaFuzzyMatcherTest() - : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::UNCASED)) + : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, GetParam()._cased ? DictionaryConfig::Match::CASED : DictionaryConfig::Match::UNCASED)) {} void populate_dictionary(const StringVector& words) { auto updater = store.make_batch_updater(); @@ -187,18 +252,27 @@ struct DfaFuzzyMatcherTest : public ::testing::Test { MatchStats stats; StringVector brute_force_matches; StringVector dfa_matches; + StringVector dfa_no_skip_matches; + bool cased = GetParam()._cased; SCOPED_TRACE(target); - brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, brute_force_matches); - dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, dfa_matches); + brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches); + dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches); + dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches); EXPECT_EQ(exp_matches, brute_force_matches); EXPECT_EQ(exp_matches, dfa_matches); + EXPECT_EQ(exp_matches, dfa_no_skip_matches); } void expect_matches(std::string_view target, const StringVector& exp_matches) { expect_prefix_matches(target, 0, exp_matches); } }; -TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) +INSTANTIATE_TEST_SUITE_P(DfaFuzzyMatcherMultiTest, + DfaFuzzyMatcherTest, + testing::Values(TestParam("uncased", false), TestParam("cased", true)), + testing::PrintToStringParamName()); + +TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) { StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", "for", "forbid", "force", "ford", "forearm", "forecast", "forest" }; @@ -211,10 +285,11 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) expect_matches("forcecast", {"forecast"}); } -TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) +TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) { + bool cased = GetParam()._cased; StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", - "for", "forbid", "force", "ford", "forearm", "forecast", "forest" }; + "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")}; populate_dictionary(words); expect_prefix_matches("a", 1, {}); expect_prefix_matches("b", 1, {"bob"}); @@ -231,25 +306,46 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) expect_prefix_matches("forcecast", 1, {"forecast"}); expect_prefix_matches("forcecast", 4, {}); expect_prefix_matches("z", 1, {}); + if (cased) { + expect_prefix_matches("h", 1, {"h", "ha"}); + expect_prefix_matches(char_from_u8(u8"Ø"), 1, {char_from_u8(u8"Ørn")}); + expect_prefix_matches(char_from_u8(u8"ø"), 1, {char_from_u8(u8"øre")}); + expect_prefix_matches(char_from_u8(u8"å"), 1, {char_from_u8(u8"ås")}); + /* Corner case: prefix length > target length means exact match */ + expect_prefix_matches("h", 2, {"h"}); + } else { + expect_prefix_matches("h", 1, {"H", "h", "HA", "ha"}); + expect_prefix_matches(char_from_u8(u8"ø"), 1, {char_from_u8(u8"øre"), char_from_u8(u8"Ørn")}); + expect_prefix_matches(char_from_u8(u8"å"), 1, {char_from_u8(u8"Ås"), char_from_u8(u8"ås")}); + /* Corner case: prefix length > target length means exact match */ + expect_prefix_matches("h", 2, {"H", "h"}); + } } void -benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm) +benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm) { MatchStats stats; StringVector dummy; for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) { const auto& entry = dict[i]; if (dfa_algorithm) { - dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy); + dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy); } else { - brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy); + brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy); } } std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl; } -TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary) +using DfaFuzzyMatcherBenchmarkTest = DfaFuzzyMatcherTest; + +INSTANTIATE_TEST_SUITE_P(DfaFuzzyMatcherBenchmarkMultiTest, + DfaFuzzyMatcherBenchmarkTest, + testing::Values(TestParam("uncased", false)), + testing::PrintToStringParamName()); + +TEST_P(DfaFuzzyMatcherBenchmarkTest, benchmark_fuzzy_match_in_dictionary) { if (!benchmarking_enabled()) { GTEST_SKIP() << "benchmarking not enabled"; @@ -258,8 +354,9 @@ TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary) populate_dictionary(to_string_vector(dict)); std::cout << "Unique words: " << store.get_num_uniques() << std::endl; sort_by_freq(dict); - benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, true); - benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, false); + bool cased = GetParam()._cased; + benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, cased, true); + benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, cased, false); } int |