aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2023-09-26 00:42:38 +0200
committerGitHub <noreply@github.com>2023-09-26 00:42:38 +0200
commitb35fddd5fa10d9fda2bdd7e2d218ab9087883a78 (patch)
tree97100ac536c81b97aa4c9c8916aba7a0a3fc6928
parentec46db5cf1bc513c01c2827b09468f902063b821 (diff)
parent4216068da71f3be3768028054c3dea5b687b0aa1 (diff)
Merge pull request #28655 from vespa-engine/geirst/integrate-dfa-fuzzy-matcherv8.232.19
Integrate DFA-based fuzzy matching.
-rw-r--r--searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_matcher.h5
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp40
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.h10
5 files changed, 54 insertions, 11 deletions
diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
index e9d0f8cb736..52329f31ba7 100644
--- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
+++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
@@ -389,8 +389,8 @@ testSingleValue(Attribute & svsa, Config &cfg)
TEST("testSingleValue")
{
EXPECT_EQUAL(24u, sizeof(SearchContext));
- EXPECT_EQUAL(40u, sizeof(StringSearchHelper));
- EXPECT_EQUAL(96u, sizeof(attribute::SingleStringEnumSearchContext));
+ EXPECT_EQUAL(48u, sizeof(StringSearchHelper));
+ EXPECT_EQUAL(104u, sizeof(attribute::SingleStringEnumSearchContext));
{
Config cfg(BasicType::STRING, CollectionType::SINGLE);
SingleValueStringAttribute svsa("svsa", cfg);
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 05ccedb39ec..6f148d1d5ba 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -320,11 +320,7 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>::use_dictionary_entry(PostingLi
++it;
return false;
} else if (this->isFuzzy()) {
- if (this->getFuzzyMatcher().isMatch(_enumStore.get_value(it.getKey().load_acquire()))) {
- return true;
- }
- ++it;
- return false;
+ return this->is_fuzzy_match(_enumStore.get_value(it.getKey().load_acquire()), it, _enumStore.get_data_store());
}
return true;
}
diff --git a/searchlib/src/vespa/searchlib/attribute/string_matcher.h b/searchlib/src/vespa/searchlib/attribute/string_matcher.h
index 05089e1251a..09ba813cefe 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/string_matcher.h
@@ -32,6 +32,11 @@ protected:
const vespalib::Regex& getRegex() const { return _helper.getRegex(); }
const vespalib::FuzzyMatcher& getFuzzyMatcher() const { return _helper.getFuzzyMatcher(); }
const QueryTermUCS4* get_query_term_ptr() const noexcept { return _query_term.get(); }
+
+ template <typename DictionaryConstIteratorType>
+ bool is_fuzzy_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) const {
+ return _helper.is_fuzzy_match(word, itr, data_store);
+ }
};
}
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index 1efe39667b8..aec317926f1 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -1,6 +1,8 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "string_search_helper.h"
+#include "dfa_fuzzy_matcher.h"
+#include "i_enum_store_dictionary.h"
#include <vespa/searchlib/query/query_term_ucs4.h>
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
@@ -12,6 +14,7 @@ namespace search::attribute {
StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm)
: _regex(),
_fuzzyMatcher(),
+ _dfa_fuzzy_matcher(),
_term(),
_termLen(),
_isPrefix(term.isPrefix()),
@@ -24,12 +27,20 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespali
? vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None)
: vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase);
} else if (isFuzzy()) {
- (void) fuzzy_matching_algorithm;
- // TODO: Select implementation based on algorithm.
_fuzzyMatcher = std::make_unique<vespalib::FuzzyMatcher>(term.getTerm(),
term.getFuzzyMaxEditDistance(),
term.getFuzzyPrefixLength(),
isCased());
+ using FMA = vespalib::FuzzyMatchingAlgorithm;
+ using LDT = vespalib::fuzzy::LevenshteinDfa::DfaType;
+ if ((fuzzy_matching_algorithm != FMA::BruteForce) &&
+ (term.getFuzzyMaxEditDistance() <= 2)) {
+ _dfa_fuzzy_matcher = std::make_unique<DfaFuzzyMatcher>(term.getTerm(),
+ term.getFuzzyMaxEditDistance(),
+ term.getFuzzyPrefixLength(),
+ isCased(),
+ (fuzzy_matching_algorithm == FMA::DfaImplicit) ? LDT::Implicit : LDT::Explicit);
+ }
} else if (isCased()) {
_term = term.getTerm();
_termLen = strlen(_term);
@@ -48,7 +59,7 @@ StringSearchHelper::isMatch(const char *src) const noexcept {
return getRegex().valid() && getRegex().partial_match(std::string_view(src));
}
if (__builtin_expect(isFuzzy(), false)) {
- return getFuzzyMatcher().isMatch(src);
+ return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(src) : getFuzzyMatcher().isMatch(src);
}
if (__builtin_expect(isCased(), false)) {
int res = strncmp(_term, src, _termLen);
@@ -67,4 +78,27 @@ StringSearchHelper::isMatch(const char *src) const noexcept {
return (_ucs4[j] == 0 && (val == 0 || isPrefix()));
}
+template <typename DictionaryConstIteratorType>
+bool
+StringSearchHelper::is_fuzzy_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) const
+{
+ if (_dfa_fuzzy_matcher) {
+ return _dfa_fuzzy_matcher->is_match(word, itr, data_store);
+ } else {
+ if (_fuzzyMatcher->isMatch(word)) {
+ return true;
+ }
+ ++itr;
+ return false;
+ }
+}
+
+template
+bool
+StringSearchHelper::is_fuzzy_match(const char*, EnumPostingTree::ConstIterator&, const DfaStringComparator::DataStoreType&) const;
+
+template
+bool
+StringSearchHelper::is_fuzzy_match(const char*, EnumTree::ConstIterator&, const DfaStringComparator::DataStoreType&) const;
+
}
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h
index 0e7a116a874..e59291e24a7 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h
@@ -2,6 +2,7 @@
#pragma once
+#include "dfa_string_comparator.h"
#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h>
#include <vespa/vespalib/regex/regex.h>
@@ -10,6 +11,8 @@ namespace search { class QueryTermUCS4; }
namespace search::attribute {
+class DfaFuzzyMatcher;
+
/**
* Helper class for search context when scanning string fields
* It handles different search settings like prefix, regex and cased/uncased.
@@ -29,11 +32,16 @@ public:
bool isCased() const noexcept { return _isCased; }
bool isFuzzy() const noexcept { return _isFuzzy; }
const vespalib::Regex & getRegex() const noexcept { return _regex; }
- const FuzzyMatcher & getFuzzyMatcher() const noexcept { return *_fuzzyMatcher; }
+ const FuzzyMatcher& getFuzzyMatcher() const noexcept { return *_fuzzyMatcher; }
+
+ template <typename DictionaryConstIteratorType>
+ bool is_fuzzy_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) const;
+
private:
using ucs4_t = uint32_t;
vespalib::Regex _regex;
std::unique_ptr<FuzzyMatcher> _fuzzyMatcher;
+ std::unique_ptr<DfaFuzzyMatcher> _dfa_fuzzy_matcher;
std::unique_ptr<ucs4_t[]> _ucs4;
const char * _term;
uint32_t _termLen; // measured in bytes