summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-01-18 10:21:52 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-01-18 10:28:21 +0000
commita8476292a061bf609e24e2731202b634bba46b95 (patch)
tree75f906e26551d545fd49f24b0c9be9e9c74dcab2 /searchlib
parentada510e89058979aecd66598d1853fd09ee0b42d (diff)
Use `string_view` for standalone DFA fuzzy match function
This is to allow for matching non-zero terminated strings, which will be needed if we are to support fuzzy matching of streaming tokenized terms. Note that the underlying DFA API already takes in a string_view, so this is just moving an implicit `strlen` one step higher in the stack. This will pull the string data into the CPU cache ever so slightly earlier than before, but this is not expected to matter in practice since the data will be used immediately after anyway. Added explicit `string_view` instantiations at the call sites to make implicit `strlen` more obviously visible than they currently are with implicit type conversions from `const char*`. Also adds a utility constructor to hide default underlying DFA implementation.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp11
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h9
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp3
4 files changed, 18 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index 433ad9e7671..8ba8c62c5ff 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -197,7 +197,7 @@ dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumS
size_t seeks = 0;
for (;itr.valid(); ++itr) {
auto word = store.get_value(itr.getKey().load_relaxed());
- if (matcher.is_match(word)) {
+ if (matcher.is_match(std::string_view(word))) {
++matches;
if (collect_matches) {
matched_words.push_back(word);
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
index 9d6e2e9815d..5f3ab9cd3d8 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -54,6 +54,11 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin
_successor = _prefix;
}
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased)
+ : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table)
+{
+}
+
DfaFuzzyMatcher::~DfaFuzzyMatcher() = default;
const char*
@@ -69,10 +74,10 @@ DfaFuzzyMatcher::skip_prefix(const char* word) const
}
bool
-DfaFuzzyMatcher::is_match(const char* word) const
+DfaFuzzyMatcher::is_match(std::string_view word) const
{
if (_prefix_size > 0) {
- Utf8ReaderForZTS reader(word);
+ Utf8Reader reader(word.data(), word.size());
size_t pos = 0;
for (; pos < _prefix.size() && reader.hasMore(); ++pos) {
uint32_t code_point = reader.getChar();
@@ -89,7 +94,7 @@ DfaFuzzyMatcher::is_match(const char* word) const
if (pos != _prefix_size) {
return false;
}
- word = reader.get_current_ptr();
+ word = word.substr(reader.getPos());
}
auto match = _dfa.match(word);
return match.matches();
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 51457129637..653af602c0d 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -27,9 +27,14 @@ private:
const char* skip_prefix(const char* word) const;
public:
DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA
~DfaFuzzyMatcher();
- bool is_match(const char *word) const;
+ [[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept {
+ return (edits == 1 || edits == 2);
+ }
+
+ [[nodiscard]] bool is_match(std::string_view word) const;
/*
* If prefix size is nonzero then this variant of is_match()
@@ -40,7 +45,7 @@ public:
* functionality in the dictionary.
*/
template <typename DictionaryConstIteratorType>
- bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store);
+ [[nodiscard]] bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store);
};
}
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index 75885aa0402..f1a643dc376 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -80,7 +80,8 @@ StringSearchHelper::isMatch(const char *src) const noexcept {
return getRegex().valid() && getRegex().partial_match(std::string_view(src));
}
if (__builtin_expect(isFuzzy(), false)) {
- return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(src) : getFuzzyMatcher().isMatch(src);
+ return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(std::string_view(src))
+ : getFuzzyMatcher().isMatch(std::string_view(src));
}
if (__builtin_expect(isCased(), false)) {
int res = strncmp(_term, src, _termLen);