Use `string_view` for standalone DFA fuzzy match function

This is to allow for matching non-zero terminated strings, which will be needed if we are to support fuzzy matching of streaming tokenized terms. Note that the underlying DFA API already takes in a string_view, so this is just moving an implicit `strlen` one step higher in the stack. This will pull the string data into the CPU cache ever so slightly earlier than before, but this is not expected to matter in practice since the data will be used immediately after anyway. Added explicit `string_view` instantiations at the call sites to make implicit `strlen` more obviously visible than they currently are with implicit type conversions from `const char*`. Also adds a utility constructor to hide default underlying DFA implementation.
author: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-18 10:21:52 +0000
committer: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-18 10:28:21 +0000
commit: a8476292a061bf609e24e2731202b634bba46b95 (patch)
tree: 75f906e26551d545fd49f24b0c9be9e9c74dcab2 /searchlib
parent: ada510e89058979aecd66598d1853fd09ee0b42d (diff)
4 files changed, 18 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index 433ad9e7671..8ba8c62c5ff 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -197,7 +197,7 @@ dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumS
     size_t seeks = 0;
     for (;itr.valid(); ++itr) {
         auto word = store.get_value(itr.getKey().load_relaxed());
-        if (matcher.is_match(word)) {
+        if (matcher.is_match(std::string_view(word))) {
             ++matches;
             if (collect_matches) {
                 matched_words.push_back(word);
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
index 9d6e2e9815d..5f3ab9cd3d8 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -54,6 +54,11 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin
     _successor = _prefix;
 }
 
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased)
+    : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table)
+{
+}
+
 DfaFuzzyMatcher::~DfaFuzzyMatcher() = default;
 
 const char*
@@ -69,10 +74,10 @@ DfaFuzzyMatcher::skip_prefix(const char* word) const
 }
 
 bool
-DfaFuzzyMatcher::is_match(const char* word) const
+DfaFuzzyMatcher::is_match(std::string_view word) const
 {
     if (_prefix_size > 0) {
-        Utf8ReaderForZTS reader(word);
+        Utf8Reader reader(word.data(), word.size());
         size_t pos = 0;
         for (; pos < _prefix.size() && reader.hasMore(); ++pos) {
             uint32_t code_point = reader.getChar();
@@ -89,7 +94,7 @@ DfaFuzzyMatcher::is_match(const char* word) const
         if (pos != _prefix_size) {
             return false;
         }
-        word = reader.get_current_ptr();
+        word = word.substr(reader.getPos());
     }
     auto match = _dfa.match(word);
     return match.matches();
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 51457129637..653af602c0d 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -27,9 +27,14 @@ private:
     const char* skip_prefix(const char* word) const;
 public:
     DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+    DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA
     ~DfaFuzzyMatcher();
 
-    bool is_match(const char *word) const;
+    [[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept {
+        return (edits == 1 || edits == 2);
+    }
+
+    [[nodiscard]] bool is_match(std::string_view word) const;
 
     /*
      * If prefix size is nonzero then this variant of is_match()
@@ -40,7 +45,7 @@ public:
      * functionality in the dictionary.
      */
     template <typename DictionaryConstIteratorType>
-    bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store);
+    [[nodiscard]] bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store);
 };
 
 }
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index 75885aa0402..f1a643dc376 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -80,7 +80,8 @@ StringSearchHelper::isMatch(const char *src) const noexcept {
         return getRegex().valid() && getRegex().partial_match(std::string_view(src));
     }
     if (__builtin_expect(isFuzzy(), false)) {
-        return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(src) : getFuzzyMatcher().isMatch(src);
+        return _dfa_fuzzy_matcher ? _dfa_fuzzy_matcher->is_match(std::string_view(src))
+                                  : getFuzzyMatcher().isMatch(std::string_view(src));
     }
     if (__builtin_expect(isCased(), false)) {
         int res = strncmp(_term, src, _termLen);
author	Tor Brede Vekterli <vekterli@vespa.ai>	2024-01-18 10:21:52 +0000
committer	Tor Brede Vekterli <vekterli@vespa.ai>	2024-01-18 10:28:21 +0000
commit	a8476292a061bf609e24e2731202b634bba46b95 (patch)
tree	75f906e26551d545fd49f24b0c9be9e9c74dcab2 /searchlib
parent	ada510e89058979aecd66598d1853fd09ee0b42d (diff)