diff options
author | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-01-18 10:21:52 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-01-18 10:28:21 +0000 |
commit | a8476292a061bf609e24e2731202b634bba46b95 (patch) | |
tree | 75f906e26551d545fd49f24b0c9be9e9c74dcab2 /searchlib | |
parent | ada510e89058979aecd66598d1853fd09ee0b42d (diff) |
Use `string_view` for standalone DFA fuzzy match function
This is to allow for matching non-zero terminated strings,
which will be needed if we are to support fuzzy matching of
streaming tokenized terms.
Note that the underlying DFA API already takes in a string_view,
so this is just moving an implicit `strlen` one step higher in the
stack. This will pull the string data into the CPU cache ever so
slightly earlier than before, but this is not expected to matter
in practice since the data will be used immediately after anyway.
Added explicit `string_view` instantiations at the call sites
to make implicit `strlen` more obviously visible than they
currently are with implicit type conversions from `const char*`.
Also adds a utility constructor to hide default underlying DFA
implementation.
Diffstat (limited to 'searchlib')
4 files changed, 18 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp index 433ad9e7671..8ba8c62c5ff 100644 --- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -197,7 +197,7 @@ dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumS size_t seeks = 0; for (;itr.valid(); ++itr) { auto word = store.get_value(itr.getKey().load_relaxed()); - if (matcher.is_match(word)) { + if (matcher.is_match(std::string_view(word))) { ++matches; if (collect_matches) { matched_words.push_back(word); diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp index 9d6e2e9815d..5f3ab9cd3d8 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp @@ -54,6 +54,11 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin _successor = _prefix; } +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased) + : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table) +{ +} + DfaFuzzyMatcher::~DfaFuzzyMatcher() = default; const char* @@ -69,10 +74,10 @@ DfaFuzzyMatcher::skip_prefix(const char* word) const } bool -DfaFuzzyMatcher::is_match(const char* word) const +DfaFuzzyMatcher::is_match(std::string_view word) const { if (_prefix_size > 0) { - Utf8ReaderForZTS reader(word); + Utf8Reader reader(word.data(), word.size()); size_t pos = 0; for (; pos < _prefix.size() && reader.hasMore(); ++pos) { uint32_t code_point = reader.getChar(); @@ -89,7 +94,7 @@ DfaFuzzyMatcher::is_match(const char* word) const if (pos != _prefix_size) { return false; } - word = reader.get_current_ptr(); + word = word.substr(reader.getPos()); } auto match = _dfa.match(word); return match.matches(); diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h index 51457129637..653af602c0d 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h @@ -27,9 +27,14 @@ private: const char* skip_prefix(const char* word) const; public: DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA ~DfaFuzzyMatcher(); - bool is_match(const char *word) const; + [[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept { + return (edits == 1 || edits == 2); + } + + [[nodiscard]] bool is_match(std::string_view word) const; /* * If prefix size is nonzero then this variant of is_match() @@ -40,7 +45,7 @@ public: * functionality in the dictionary. */ template <typename DictionaryConstIteratorType> - bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store); + [[nodiscard]] bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store); }; } diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index 75885aa0402..f1a643dc376 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -80,7 +80,8 @@ StringSearchHelper::isMatch(const char *src) const noexcept { return getRegex().valid() && getRegex().partial_match(std::string_view(src)); } if (__builtin_expect(isFuzzy(), false)) { - return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(src) : getFuzzyMatcher().isMatch(src); + return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(std::string_view(src)) + : getFuzzyMatcher().isMatch(std::string_view(src)); } if (__builtin_expect(isCased(), false)) { int res = strncmp(_term, src, _termLen); |