From 4216068da71f3be3768028054c3dea5b687b0aa1 Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Mon, 25 Sep 2023 15:54:44 +0000 Subject: Integrate DFA-based fuzzy matching. --- .../stringattribute/stringattribute_test.cpp | 4 +-- .../searchlib/attribute/postinglistsearchcontext.h | 6 +--- .../src/vespa/searchlib/attribute/string_matcher.h | 5 +++ .../searchlib/attribute/string_search_helper.cpp | 40 ++++++++++++++++++++-- .../searchlib/attribute/string_search_helper.h | 10 +++++- 5 files changed, 54 insertions(+), 11 deletions(-) diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp index e9d0f8cb736..52329f31ba7 100644 --- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp +++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp @@ -389,8 +389,8 @@ testSingleValue(Attribute & svsa, Config &cfg) TEST("testSingleValue") { EXPECT_EQUAL(24u, sizeof(SearchContext)); - EXPECT_EQUAL(40u, sizeof(StringSearchHelper)); - EXPECT_EQUAL(96u, sizeof(attribute::SingleStringEnumSearchContext)); + EXPECT_EQUAL(48u, sizeof(StringSearchHelper)); + EXPECT_EQUAL(104u, sizeof(attribute::SingleStringEnumSearchContext)); { Config cfg(BasicType::STRING, CollectionType::SINGLE); SingleValueStringAttribute svsa("svsa", cfg); diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 05ccedb39ec..6f148d1d5ba 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -320,11 +320,7 @@ StringPostingSearchContext::use_dictionary_entry(PostingLi ++it; return false; } else if (this->isFuzzy()) { - if (this->getFuzzyMatcher().isMatch(_enumStore.get_value(it.getKey().load_acquire()))) { - return true; - } - ++it; - return false; + return this->is_fuzzy_match(_enumStore.get_value(it.getKey().load_acquire()), it, _enumStore.get_data_store()); } return true; } diff --git a/searchlib/src/vespa/searchlib/attribute/string_matcher.h b/searchlib/src/vespa/searchlib/attribute/string_matcher.h index 05089e1251a..09ba813cefe 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/string_matcher.h @@ -32,6 +32,11 @@ protected: const vespalib::Regex& getRegex() const { return _helper.getRegex(); } const vespalib::FuzzyMatcher& getFuzzyMatcher() const { return _helper.getFuzzyMatcher(); } const QueryTermUCS4* get_query_term_ptr() const noexcept { return _query_term.get(); } + + template + bool is_fuzzy_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) const { + return _helper.is_fuzzy_match(word, itr, data_store); + } }; } diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index 1efe39667b8..aec317926f1 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -1,6 +1,8 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "string_search_helper.h" +#include "dfa_fuzzy_matcher.h" +#include "i_enum_store_dictionary.h" #include #include #include @@ -12,6 +14,7 @@ namespace search::attribute { StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm) : _regex(), _fuzzyMatcher(), + _dfa_fuzzy_matcher(), _term(), _termLen(), _isPrefix(term.isPrefix()), @@ -24,12 +27,20 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespali ? vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None) : vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); } else if (isFuzzy()) { - (void) fuzzy_matching_algorithm; - // TODO: Select implementation based on algorithm. _fuzzyMatcher = std::make_unique(term.getTerm(), term.getFuzzyMaxEditDistance(), term.getFuzzyPrefixLength(), isCased()); + using FMA = vespalib::FuzzyMatchingAlgorithm; + using LDT = vespalib::fuzzy::LevenshteinDfa::DfaType; + if ((fuzzy_matching_algorithm != FMA::BruteForce) && + (term.getFuzzyMaxEditDistance() <= 2)) { + _dfa_fuzzy_matcher = std::make_unique(term.getTerm(), + term.getFuzzyMaxEditDistance(), + term.getFuzzyPrefixLength(), + isCased(), + (fuzzy_matching_algorithm == FMA::DfaImplicit) ? LDT::Implicit : LDT::Explicit); + } } else if (isCased()) { _term = term.getTerm(); _termLen = strlen(_term); @@ -48,7 +59,7 @@ StringSearchHelper::isMatch(const char *src) const noexcept { return getRegex().valid() && getRegex().partial_match(std::string_view(src)); } if (__builtin_expect(isFuzzy(), false)) { - return getFuzzyMatcher().isMatch(src); + return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(src) : getFuzzyMatcher().isMatch(src); } if (__builtin_expect(isCased(), false)) { int res = strncmp(_term, src, _termLen); @@ -67,4 +78,27 @@ StringSearchHelper::isMatch(const char *src) const noexcept { return (_ucs4[j] == 0 && (val == 0 || isPrefix())); } +template +bool +StringSearchHelper::is_fuzzy_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) const +{ + if (_dfa_fuzzy_matcher) { + return _dfa_fuzzy_matcher->is_match(word, itr, data_store); + } else { + if (_fuzzyMatcher->isMatch(word)) { + return true; + } + ++itr; + return false; + } +} + +template +bool +StringSearchHelper::is_fuzzy_match(const char*, EnumPostingTree::ConstIterator&, const DfaStringComparator::DataStoreType&) const; + +template +bool +StringSearchHelper::is_fuzzy_match(const char*, EnumTree::ConstIterator&, const DfaStringComparator::DataStoreType&) const; + } diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h index 0e7a116a874..e59291e24a7 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h @@ -2,6 +2,7 @@ #pragma once +#include "dfa_string_comparator.h" #include #include @@ -10,6 +11,8 @@ namespace search { class QueryTermUCS4; } namespace search::attribute { +class DfaFuzzyMatcher; + /** * Helper class for search context when scanning string fields * It handles different search settings like prefix, regex and cased/uncased. @@ -29,11 +32,16 @@ public: bool isCased() const noexcept { return _isCased; } bool isFuzzy() const noexcept { return _isFuzzy; } const vespalib::Regex & getRegex() const noexcept { return _regex; } - const FuzzyMatcher & getFuzzyMatcher() const noexcept { return *_fuzzyMatcher; } + const FuzzyMatcher& getFuzzyMatcher() const noexcept { return *_fuzzyMatcher; } + + template + bool is_fuzzy_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) const; + private: using ucs4_t = uint32_t; vespalib::Regex _regex; std::unique_ptr _fuzzyMatcher; + std::unique_ptr _dfa_fuzzy_matcher; std::unique_ptr _ucs4; const char * _term; uint32_t _termLen; // measured in bytes -- cgit v1.2.3