diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-09-25 11:35:44 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-25 11:35:44 +0200 |
commit | 840fd17d00103a880a8d87e82b9f28bc228d1336 (patch) | |
tree | 8ddb62facbebc83e6fac274fcb0722323d8f1319 /searchlib/src | |
parent | 4d5c9f2d3033a4e51b916dcde569f8f0451bcee0 (diff) | |
parent | 60eb4d43fce41b252f9e16cf951a009f1d64f0ed (diff) |
Merge pull request #28624 from vespa-engine/toregge/add-prefix-size-constructor-argument-to-dfa-fuzzy-matcher
Add prefix_size constructor argument to DfaFuzzyMatcher.
Diffstat (limited to 'searchlib/src')
3 files changed, 127 insertions, 22 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp index c7bd0e917f3..77e23a58163 100644 --- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -26,6 +26,7 @@ using namespace search::attribute; using namespace search; using vespalib::FuzzyMatcher; using vespalib::datastore::AtomicEntryRef; +using vespalib::datastore::EntryRef; using vespalib::fuzzy::LevenshteinDfa; using StringEnumStore = EnumStoreT<const char*>; @@ -109,11 +110,11 @@ struct MatchStats { template <bool collect_matches> void -brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words) +brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - FuzzyMatcher matcher(target, 2, 0, false); + FuzzyMatcher matcher(target, 2, prefix_size, false); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -133,15 +134,27 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, false, LevenshteinDfa::DfaType::Explicit); - auto itr = view.begin(); + DfaFuzzyMatcher matcher(target, 2, prefix_size, false, LevenshteinDfa::DfaType::Explicit); + std::string target_copy(target.substr(0, prefix_size)); + auto prefix_cmp = store.make_folded_comparator_prefix(target_copy.c_str()); + auto itr = prefix_size > 0 ? view.lowerBound(AtomicEntryRef(), prefix_cmp) : view.begin(); + auto itr_end = itr; + if (itr_end.valid()) { + if (prefix_size > 0) { + if (!prefix_cmp.less(EntryRef(), itr_end.getKey().load_relaxed())) { + itr_end.seekPast(AtomicEntryRef(), prefix_cmp); + } + } else { + itr_end.end(); + } + } size_t matches = 0; size_t seeks = 0; - while (itr.valid()) { + while (itr != itr_end) { auto word = store.get_value(itr.getKey().load_relaxed()); if (matcher.is_match(word, itr, store.get_data_store())) { ++itr; @@ -170,15 +183,19 @@ struct DfaFuzzyMatcherTest : public ::testing::Test { updater.commit(); store.freeze_dictionary(); } - void expect_matches(std::string_view target, const StringVector& exp_matches) { + void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) { MatchStats stats; StringVector brute_force_matches; StringVector dfa_matches; - brute_force_fuzzy_match_in_dictionary<true>(target, store, stats, brute_force_matches); - dfa_fuzzy_match_in_dictionary<true>(target, store, stats, dfa_matches); + SCOPED_TRACE(target); + brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, brute_force_matches); + dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, dfa_matches); EXPECT_EQ(exp_matches, brute_force_matches); EXPECT_EQ(exp_matches, dfa_matches); } + void expect_matches(std::string_view target, const StringVector& exp_matches) { + expect_prefix_matches(target, 0, exp_matches); + } }; TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) @@ -194,6 +211,28 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) expect_matches("forcecast", {"forecast"}); } +TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) +{ + StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", + "for", "forbid", "force", "ford", "forearm", "forecast", "forest" }; + populate_dictionary(words); + expect_prefix_matches("a", 1, {}); + expect_prefix_matches("b", 1, {"bob"}); + expect_prefix_matches("board", 1, {"board", "boat"}); + expect_prefix_matches("c", 1, {}); + expect_prefix_matches("food", 1, {"food", "foot", "for", "ford"}); + expect_prefix_matches("food", 2, {"food", "foot", "for", "ford"}); + expect_prefix_matches("food", 3, {"food", "foot"}); + expect_prefix_matches("foothill", 1, {"football", "foothill"}); + expect_prefix_matches("for", 1, {"food", "foot", "for", "force", "ford"}); + expect_prefix_matches("for", 2, {"food", "foot", "for", "force", "ford"}); + expect_prefix_matches("for", 3, {"for", "force", "ford"}); + expect_prefix_matches("force", 1, {"for", "force", "ford"}); + expect_prefix_matches("forcecast", 1, {"forecast"}); + expect_prefix_matches("forcecast", 4, {}); + expect_prefix_matches("z", 1, {}); +} + void benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm) { @@ -202,9 +241,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) { const auto& entry = dict[i]; if (dfa_algorithm) { - dfa_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy); + dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy); } else { - brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy); + brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy); } } std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl; diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp index 580c34bd5d0..f66dd56c72f 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp @@ -1,17 +1,70 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "dfa_fuzzy_matcher.h" +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/text/lowercase.h> using vespalib::fuzzy::LevenshteinDfa; +using vespalib::LowerCase; +using vespalib::Utf8Reader; +using vespalib::Utf8ReaderForZTS; namespace search::attribute { -DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, LevenshteinDfa::DfaType dfa_type) - : _dfa(vespalib::fuzzy::LevenshteinDfa::build(target, max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)), - _successor() +namespace { + +std::vector<uint32_t> +extract_prefix(std::string_view target, uint32_t prefix_size, bool cased) +{ + std::vector<uint32_t> result; + result.reserve(prefix_size); + Utf8Reader reader(vespalib::stringref(target.data(), target.size())); + for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) { + uint32_t code_point = reader.getChar(); + if (!cased) { + code_point = LowerCase::convert(code_point); + } + result.emplace_back(code_point); + } + return result; +} + +std::string_view +extract_suffix(std::string_view target, uint32_t prefix_size) +{ + Utf8Reader reader(vespalib::stringref(target.data(), target.size())); + for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) { + (void) reader.getChar(); + } + std::string_view result = target; + result.remove_prefix(reader.getPos()); + return result; +} + +} + +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type) + : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)), + _successor(), + _prefix(extract_prefix(target, prefix_size, cased)), + _successor_suffix(), + _prefix_size(prefix_size) { + _successor = _prefix; } DfaFuzzyMatcher::~DfaFuzzyMatcher() = default; +const char* +DfaFuzzyMatcher::skip_prefix(const char* word) const +{ + Utf8ReaderForZTS reader(word); + size_t pos = 0; + for (; pos < _prefix_size && reader.hasMore(); ++pos) { + (void) reader.getChar(); + } + assert(pos == _prefix_size); + return reader.get_current_ptr(); +} + } diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h index fcba13f85a4..3d24948e044 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h @@ -17,22 +17,35 @@ namespace search::attribute { class DfaFuzzyMatcher { private: vespalib::fuzzy::LevenshteinDfa _dfa; - std::vector<uint32_t> _successor; + std::vector<uint32_t> _successor; + std::vector<uint32_t> _prefix; + std::vector<uint32_t> _successor_suffix; + uint32_t _prefix_size; + const char*skip_prefix(const char* word) const; public: - DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); ~DfaFuzzyMatcher(); template <typename DictionaryConstIteratorType> bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) { - auto match = _dfa.match(word, _successor); - if (match.matches()) { - return true; + if (_prefix_size > 0) { + word = skip_prefix(word); + auto match = _dfa.match(word, _successor_suffix); + if (match.matches()) { + return true; + } + _successor.resize(_prefix.size()); + _successor.insert(_successor.end(), _successor_suffix.begin(), _successor_suffix.end()); } else { - DfaStringComparator cmp(data_store, _successor); - itr.seek(vespalib::datastore::AtomicEntryRef(), cmp); - return false; + auto match = _dfa.match(word, _successor); + if (match.matches()) { + return true; + } } + DfaStringComparator cmp(data_store, _successor); + itr.seek(vespalib::datastore::AtomicEntryRef(), cmp); + return false; } }; |