diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-09-25 13:33:40 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-25 13:33:40 +0200 |
commit | 7c2b17a3a91890b592117986799d84e8e26b1ad7 (patch) | |
tree | 65160a8e39f24ab5ca48e4f3d77eb44c7df5ad70 /searchlib/src | |
parent | cc7800772354a55059a6aaaf07d3278b91e558cd (diff) | |
parent | 83c7120ea16e0b09e0b8ee47ae254d0a83f8649d (diff) |
Merge branch 'master' into balder/lift-single-filter-terms-out-from-ws
Diffstat (limited to 'searchlib/src')
11 files changed, 171 insertions, 59 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp index c7bd0e917f3..77e23a58163 100644 --- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -26,6 +26,7 @@ using namespace search::attribute; using namespace search; using vespalib::FuzzyMatcher; using vespalib::datastore::AtomicEntryRef; +using vespalib::datastore::EntryRef; using vespalib::fuzzy::LevenshteinDfa; using StringEnumStore = EnumStoreT<const char*>; @@ -109,11 +110,11 @@ struct MatchStats { template <bool collect_matches> void -brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words) +brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - FuzzyMatcher matcher(target, 2, 0, false); + FuzzyMatcher matcher(target, 2, prefix_size, false); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -133,15 +134,27 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, false, LevenshteinDfa::DfaType::Explicit); - auto itr = view.begin(); + DfaFuzzyMatcher matcher(target, 2, prefix_size, false, LevenshteinDfa::DfaType::Explicit); + std::string target_copy(target.substr(0, prefix_size)); + auto prefix_cmp = store.make_folded_comparator_prefix(target_copy.c_str()); + auto itr = prefix_size > 0 ? view.lowerBound(AtomicEntryRef(), prefix_cmp) : view.begin(); + auto itr_end = itr; + if (itr_end.valid()) { + if (prefix_size > 0) { + if (!prefix_cmp.less(EntryRef(), itr_end.getKey().load_relaxed())) { + itr_end.seekPast(AtomicEntryRef(), prefix_cmp); + } + } else { + itr_end.end(); + } + } size_t matches = 0; size_t seeks = 0; - while (itr.valid()) { + while (itr != itr_end) { auto word = store.get_value(itr.getKey().load_relaxed()); if (matcher.is_match(word, itr, store.get_data_store())) { ++itr; @@ -170,15 +183,19 @@ struct DfaFuzzyMatcherTest : public ::testing::Test { updater.commit(); store.freeze_dictionary(); } - void expect_matches(std::string_view target, const StringVector& exp_matches) { + void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) { MatchStats stats; StringVector brute_force_matches; StringVector dfa_matches; - brute_force_fuzzy_match_in_dictionary<true>(target, store, stats, brute_force_matches); - dfa_fuzzy_match_in_dictionary<true>(target, store, stats, dfa_matches); + SCOPED_TRACE(target); + brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, brute_force_matches); + dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, dfa_matches); EXPECT_EQ(exp_matches, brute_force_matches); EXPECT_EQ(exp_matches, dfa_matches); } + void expect_matches(std::string_view target, const StringVector& exp_matches) { + expect_prefix_matches(target, 0, exp_matches); + } }; TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) @@ -194,6 +211,28 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) expect_matches("forcecast", {"forecast"}); } +TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) +{ + StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", + "for", "forbid", "force", "ford", "forearm", "forecast", "forest" }; + populate_dictionary(words); + expect_prefix_matches("a", 1, {}); + expect_prefix_matches("b", 1, {"bob"}); + expect_prefix_matches("board", 1, {"board", "boat"}); + expect_prefix_matches("c", 1, {}); + expect_prefix_matches("food", 1, {"food", "foot", "for", "ford"}); + expect_prefix_matches("food", 2, {"food", "foot", "for", "ford"}); + expect_prefix_matches("food", 3, {"food", "foot"}); + expect_prefix_matches("foothill", 1, {"football", "foothill"}); + expect_prefix_matches("for", 1, {"food", "foot", "for", "force", "ford"}); + expect_prefix_matches("for", 2, {"food", "foot", "for", "force", "ford"}); + expect_prefix_matches("for", 3, {"for", "force", "ford"}); + expect_prefix_matches("force", 1, {"for", "force", "ford"}); + expect_prefix_matches("forcecast", 1, {"forecast"}); + expect_prefix_matches("forcecast", 4, {}); + expect_prefix_matches("z", 1, {}); +} + void benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm) { @@ -202,9 +241,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) { const auto& entry = dict[i]; if (dfa_algorithm) { - dfa_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy); + dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy); } else { - brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy); + brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy); } } std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl; diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp index ef0fd56840a..c617db871a7 100644 --- a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp @@ -128,9 +128,9 @@ TEST("test And propagates updated histestimate") { const RememberExecuteInfo & child = dynamic_cast<const RememberExecuteInfo &>(bp.getChild(i)); EXPECT_EQUAL((i == 0), child.executeInfo.isStrict()); } - EXPECT_EQUAL(1.0, dynamic_cast<const RememberExecuteInfo &>(bp.getChild(0)).executeInfo.hitRate()); - EXPECT_EQUAL(1.0/250, dynamic_cast<const RememberExecuteInfo &>(bp.getChild(1)).executeInfo.hitRate()); - EXPECT_EQUAL(1.0/(250*25), dynamic_cast<const RememberExecuteInfo &>(bp.getChild(2)).executeInfo.hitRate()); + EXPECT_EQUAL(1.0f, dynamic_cast<const RememberExecuteInfo &>(bp.getChild(0)).executeInfo.hitRate()); + EXPECT_EQUAL(1.0f/250, dynamic_cast<const RememberExecuteInfo &>(bp.getChild(1)).executeInfo.hitRate()); + EXPECT_EQUAL(1.0f/(250*25), dynamic_cast<const RememberExecuteInfo &>(bp.getChild(2)).executeInfo.hitRate()); } TEST("test And Blueprint") { diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp index 580c34bd5d0..f66dd56c72f 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp @@ -1,17 +1,70 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "dfa_fuzzy_matcher.h" +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/text/lowercase.h> using vespalib::fuzzy::LevenshteinDfa; +using vespalib::LowerCase; +using vespalib::Utf8Reader; +using vespalib::Utf8ReaderForZTS; namespace search::attribute { -DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, LevenshteinDfa::DfaType dfa_type) - : _dfa(vespalib::fuzzy::LevenshteinDfa::build(target, max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)), - _successor() +namespace { + +std::vector<uint32_t> +extract_prefix(std::string_view target, uint32_t prefix_size, bool cased) +{ + std::vector<uint32_t> result; + result.reserve(prefix_size); + Utf8Reader reader(vespalib::stringref(target.data(), target.size())); + for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) { + uint32_t code_point = reader.getChar(); + if (!cased) { + code_point = LowerCase::convert(code_point); + } + result.emplace_back(code_point); + } + return result; +} + +std::string_view +extract_suffix(std::string_view target, uint32_t prefix_size) +{ + Utf8Reader reader(vespalib::stringref(target.data(), target.size())); + for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) { + (void) reader.getChar(); + } + std::string_view result = target; + result.remove_prefix(reader.getPos()); + return result; +} + +} + +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type) + : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)), + _successor(), + _prefix(extract_prefix(target, prefix_size, cased)), + _successor_suffix(), + _prefix_size(prefix_size) { + _successor = _prefix; } DfaFuzzyMatcher::~DfaFuzzyMatcher() = default; +const char* +DfaFuzzyMatcher::skip_prefix(const char* word) const +{ + Utf8ReaderForZTS reader(word); + size_t pos = 0; + for (; pos < _prefix_size && reader.hasMore(); ++pos) { + (void) reader.getChar(); + } + assert(pos == _prefix_size); + return reader.get_current_ptr(); +} + } diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h index fcba13f85a4..3d24948e044 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h @@ -17,22 +17,35 @@ namespace search::attribute { class DfaFuzzyMatcher { private: vespalib::fuzzy::LevenshteinDfa _dfa; - std::vector<uint32_t> _successor; + std::vector<uint32_t> _successor; + std::vector<uint32_t> _prefix; + std::vector<uint32_t> _successor_suffix; + uint32_t _prefix_size; + const char*skip_prefix(const char* word) const; public: - DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); ~DfaFuzzyMatcher(); template <typename DictionaryConstIteratorType> bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) { - auto match = _dfa.match(word, _successor); - if (match.matches()) { - return true; + if (_prefix_size > 0) { + word = skip_prefix(word); + auto match = _dfa.match(word, _successor_suffix); + if (match.matches()) { + return true; + } + _successor.resize(_prefix.size()); + _successor.insert(_successor.end(), _successor_suffix.begin(), _successor_suffix.end()); } else { - DfaStringComparator cmp(data_store, _successor); - itr.seek(vespalib::datastore::AtomicEntryRef(), cmp); - return false; + auto match = _dfa.match(word, _successor); + if (match.matches()) { + return true; + } } + DfaStringComparator cmp(data_store, _successor); + itr.seek(vespalib::datastore::AtomicEntryRef(), cmp); + return false; } }; diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp index 3f6085ef7ff..94d1a4917fd 100644 --- a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp @@ -621,7 +621,7 @@ IntermediateBlueprint::fetchPostings(const ExecuteInfo &execInfo) double nextHitRate = execInfo.hitRate(); for (size_t i = 0; i < _children.size(); ++i) { Blueprint & child = *_children[i]; - child.fetchPostings(ExecuteInfo::create(execInfo.isStrict() && inheritStrict(i), nextHitRate)); + child.fetchPostings(ExecuteInfo::create(execInfo.isStrict() && inheritStrict(i), nextHitRate, execInfo.getDoom())); nextHitRate = computeNextHitRate(child, nextHitRate); } } diff --git a/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp index 795f5f1424a..4322cafb5c8 100644 --- a/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp @@ -66,7 +66,7 @@ DotProductBlueprint::createFilterSearch(bool strict, FilterConstraint constraint void DotProductBlueprint::fetchPostings(const ExecuteInfo &execInfo) { - ExecuteInfo childInfo = ExecuteInfo::create(true, execInfo.hitRate()); + ExecuteInfo childInfo = ExecuteInfo::create(true, execInfo); for (size_t i = 0; i < _terms.size(); ++i) { _terms[i]->fetchPostings(childInfo); } diff --git a/searchlib/src/vespa/searchlib/queryeval/executeinfo.cpp b/searchlib/src/vespa/searchlib/queryeval/executeinfo.cpp index 604e20d2262..e5d20f047f5 100644 --- a/searchlib/src/vespa/searchlib/queryeval/executeinfo.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/executeinfo.cpp @@ -4,17 +4,7 @@ namespace search::queryeval { -const ExecuteInfo ExecuteInfo::TRUE(true, 1.0); -const ExecuteInfo ExecuteInfo::FALSE(false, 1.0); - -ExecuteInfo -ExecuteInfo::create(bool strict) { - return create(strict, 1.0); -} - -ExecuteInfo -ExecuteInfo::create(bool strict, double hitRate) { - return ExecuteInfo(strict, hitRate); -} +const ExecuteInfo ExecuteInfo::TRUE(true, 1.0, nullptr); +const ExecuteInfo ExecuteInfo::FALSE(false, 1.0, nullptr); } diff --git a/searchlib/src/vespa/searchlib/queryeval/executeinfo.h b/searchlib/src/vespa/searchlib/queryeval/executeinfo.h index e161b2bdab7..2dd34284bef 100644 --- a/searchlib/src/vespa/searchlib/queryeval/executeinfo.h +++ b/searchlib/src/vespa/searchlib/queryeval/executeinfo.h @@ -1,8 +1,9 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -// Copyright 2019 Oath inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include <vespa/vespalib/util/doom.h> + namespace search::queryeval { /** @@ -11,20 +12,37 @@ namespace search::queryeval { */ class ExecuteInfo { public: - ExecuteInfo() : ExecuteInfo(false, 1.0) { } - bool isStrict() const { return _strict; } - double hitRate() const { return _hitRate; } + ExecuteInfo() noexcept : ExecuteInfo(false, 1.0F, nullptr) { } + bool isStrict() const noexcept { return _strict; } + float hitRate() const noexcept { return _hitRate; } + bool soft_doom() const noexcept { return _doom && _doom->soft_doom(); } + const vespalib::Doom * getDoom() const { return _doom; } static const ExecuteInfo TRUE; static const ExecuteInfo FALSE; - static ExecuteInfo create(bool strict); - static ExecuteInfo create(bool strict, double HitRate); + static ExecuteInfo create(bool strict, const ExecuteInfo & org) noexcept { + return {strict, org._hitRate, org.getDoom()}; + } + static ExecuteInfo create(bool strict, const vespalib::Doom * doom) noexcept { + return create(strict, 1.0F, doom); + } + static ExecuteInfo create(bool strict, float hitRate, const vespalib::Doom * doom) noexcept { + return {strict, hitRate, doom}; + } + static ExecuteInfo create(bool strict) noexcept { + return create(strict, 1.0F); + } + static ExecuteInfo create(bool strict, float hitRate) noexcept { + return create(strict, hitRate, nullptr); + } private: - ExecuteInfo(bool strict, double hitRate_in) - : _hitRate(hitRate_in), + ExecuteInfo(bool strict, float hitRate_in, const vespalib::Doom * doom) noexcept + : _doom(doom), + _hitRate(hitRate_in), _strict(strict) { } - double _hitRate; - bool _strict; + const vespalib::Doom * _doom; + float _hitRate; + bool _strict; }; } diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp index 9c3910b20f9..16461487525 100644 --- a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp @@ -57,7 +57,7 @@ void SameElementBlueprint::fetchPostings(const ExecuteInfo &execInfo) { for (size_t i = 0; i < _terms.size(); ++i) { - _terms[i]->fetchPostings(ExecuteInfo::create(execInfo.isStrict() && (i == 0), execInfo.hitRate())); + _terms[i]->fetchPostings(ExecuteInfo::create(execInfo.isStrict() && (i == 0), execInfo)); } } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp index 48a09f099a6..eb6241a99d5 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp @@ -4,7 +4,6 @@ #include "wand_parts.h" #include "parallel_weak_and_search.h" #include <vespa/searchlib/queryeval/field_spec.hpp> -#include <vespa/searchlib/queryeval/emptysearch.h> #include <vespa/searchlib/queryeval/searchiterator.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/vespalib/objects/visit.hpp> @@ -77,10 +76,10 @@ ParallelWeakAndBlueprint::createLeafSearch(const search::fef::TermFieldMatchData const State &childState = _terms[i]->getState(); assert(childState.numFields() == 1); // TODO: pass ownership with unique_ptr - terms.push_back(wand::Term(_terms[i]->createSearch(*childrenMatchData, true).release(), - _weights[i], - childState.estimate().estHits, - childState.field(0).resolve(*childrenMatchData))); + terms.emplace_back(_terms[i]->createSearch(*childrenMatchData, true).release(), + _weights[i], + childState.estimate().estHits, + childState.field(0).resolve(*childrenMatchData)); } return SearchIterator::UP (ParallelWeakAndSearch::create(terms, @@ -101,9 +100,9 @@ ParallelWeakAndBlueprint::createFilterSearch(bool strict, FilterConstraint const void ParallelWeakAndBlueprint::fetchPostings(const ExecuteInfo & execInfo) { - ExecuteInfo childInfo = ExecuteInfo::create(true, execInfo.hitRate()); - for (size_t i = 0; i < _terms.size(); ++i) { - _terms[i]->fetchPostings(childInfo); + ExecuteInfo childInfo = ExecuteInfo::create(true, execInfo); + for (const auto & _term : _terms) { + _term->fetchPostings(childInfo); } } diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp index 72ce6aed36c..97f6bc2e6f8 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp @@ -125,7 +125,7 @@ WeightedSetTermBlueprint::create_matching_elements_search(const MatchingElements void WeightedSetTermBlueprint::fetchPostings(const ExecuteInfo &execInfo) { - ExecuteInfo childInfo = ExecuteInfo::create(true, execInfo.hitRate()); + ExecuteInfo childInfo = ExecuteInfo::create(true, execInfo); for (const auto & _term : _terms) { _term->fetchPostings(childInfo); } |