Add another is_match member function to dfa fuzzy matcher that doesn't

try to update directory iterator.
author: Tor Egge <Tor.Egge@online.no> 2023-09-25 15:51:11 +0200
committer: Tor Egge <Tor.Egge@online.no> 2023-09-25 15:51:11 +0200
commit: 532e823b64a036e3894241507fd49b1ab79ccead (patch)
tree: 2ef7f755bee4cfab37371fa8de54555edb2144d8 /searchlib
parent: 27472f94770a6644d44b765cedf802d8bb38ac03 (diff)
3 files changed, 172 insertions, 27 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index 77e23a58163..c2a39779061 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -8,6 +8,7 @@
 #include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
 #include <vespa/vespalib/gtest/gtest.h>
 #include <vespa/vespalib/util/time.h>
+#include <vespa/vespalib/text/utf8.h>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
@@ -28,12 +29,22 @@ using vespalib::FuzzyMatcher;
 using vespalib::datastore::AtomicEntryRef;
 using vespalib::datastore::EntryRef;
 using vespalib::fuzzy::LevenshteinDfa;
+using vespalib::Utf8Reader;
+using vespalib::Utf8Writer;
 
 using StringEnumStore = EnumStoreT<const char*>;
 using DictionaryEntry = std::pair<std::string, size_t>;
 using RawDictionary = std::vector<DictionaryEntry>;
 using StringVector = std::vector<std::string>;
 
+namespace {
+
+const char* char_from_u8(const char8_t* p) {
+    return reinterpret_cast<const char*>(p);
+}
+
+}
+
 RawDictionary
 read_dictionary()
 {
@@ -110,11 +121,11 @@ struct MatchStats {
 
 template <bool collect_matches>
 void
-brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words)
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
 {
     auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
     vespalib::Timer timer;
-    FuzzyMatcher matcher(target, 2, prefix_size, false);
+    FuzzyMatcher matcher(target, 2, prefix_size, cased);
     auto itr = view.begin();
     size_t matches = 0;
     size_t seeks = 0;
@@ -134,12 +145,18 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS
 
 template <bool collect_matches>
 void
-dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
 {
     auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
     vespalib::Timer timer;
-    DfaFuzzyMatcher matcher(target, 2, prefix_size, false, LevenshteinDfa::DfaType::Explicit);
-    std::string target_copy(target.substr(0, prefix_size));
+    DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+    Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
+    std::string target_copy;
+    Utf8Writer<std::string> writer(target_copy);
+    for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) {
+        auto code_point = reader.getChar();
+        writer.putChar(code_point);
+    }
     auto prefix_cmp = store.make_folded_comparator_prefix(target_copy.c_str());
     auto itr = prefix_size > 0 ? view.lowerBound(AtomicEntryRef(), prefix_cmp) : view.begin();
     auto itr_end = itr;
@@ -169,10 +186,58 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st
     stats.add_sample(matches, seeks, timer.elapsed());
 }
 
-struct DfaFuzzyMatcherTest : public ::testing::Test {
+template <bool collect_matches>
+void
+dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+{
+    auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
+    vespalib::Timer timer;
+    DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+    auto itr = view.begin();
+    size_t matches = 0;
+    size_t seeks = 0;
+    for (;itr.valid(); ++itr) {
+        auto word = store.get_value(itr.getKey().load_relaxed());
+        if (matcher.is_match(word)) {
+            ++matches;
+            if (collect_matches) {
+                matched_words.push_back(word);
+            }
+        } else {
+            ++seeks;
+        }
+    }
+    stats.add_sample(matches, seeks, timer.elapsed());
+}
+
+struct TestParam
+{
+    vespalib::string _name;
+    bool             _cased;
+
+    TestParam(vespalib::string name, bool cased)
+        : _name(std::move(name)),
+          _cased(cased)
+    {
+    }
+    TestParam(const TestParam&);
+    ~TestParam();
+};
+
+TestParam::TestParam(const TestParam&) = default;
+
+TestParam::~TestParam() = default;
+
+std::ostream& operator<<(std::ostream& os, const TestParam& param)
+{
+    os << param._name;
+    return os;
+}
+
+struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> {
     StringEnumStore store;
     DfaFuzzyMatcherTest()
-        : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::UNCASED))
+        : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, GetParam()._cased ? DictionaryConfig::Match::CASED : DictionaryConfig::Match::UNCASED))
     {}
     void populate_dictionary(const StringVector& words) {
         auto updater = store.make_batch_updater();
@@ -187,18 +252,27 @@ struct DfaFuzzyMatcherTest : public ::testing::Test {
         MatchStats stats;
         StringVector brute_force_matches;
         StringVector dfa_matches;
+        StringVector dfa_no_skip_matches;
+        bool cased = GetParam()._cased;
         SCOPED_TRACE(target);
-        brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, brute_force_matches);
-        dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, dfa_matches);
+        brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches);
+        dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches);
+        dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches);
         EXPECT_EQ(exp_matches, brute_force_matches);
         EXPECT_EQ(exp_matches, dfa_matches);
+        EXPECT_EQ(exp_matches, dfa_no_skip_matches);
     }
     void expect_matches(std::string_view target, const StringVector& exp_matches) {
         expect_prefix_matches(target, 0, exp_matches);
     }
 };
 
-TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
+INSTANTIATE_TEST_SUITE_P(DfaFuzzyMatcherMultiTest,
+                         DfaFuzzyMatcherTest,
+                         testing::Values(TestParam("uncased", false), TestParam("cased", true)),
+                         testing::PrintToStringParamName());
+
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
 {
     StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
                            "for", "forbid", "force", "ford", "forearm", "forecast", "forest" };
@@ -211,10 +285,11 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
     expect_matches("forcecast", {"forecast"});
 }
 
-TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
 {
+    bool cased = GetParam()._cased;
     StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
-                           "for", "forbid", "force", "ford", "forearm", "forecast", "forest" };
+                           "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
     populate_dictionary(words);
     expect_prefix_matches("a", 1, {});
     expect_prefix_matches("b", 1, {"bob"});
@@ -231,25 +306,46 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
     expect_prefix_matches("forcecast", 1, {"forecast"});
     expect_prefix_matches("forcecast", 4, {});
     expect_prefix_matches("z", 1, {});
+    if (cased) {
+        expect_prefix_matches("h", 1, {"h", "ha"});
+        expect_prefix_matches(char_from_u8(u8"Ø"), 1, {char_from_u8(u8"Ørn")});
+        expect_prefix_matches(char_from_u8(u8"ø"), 1, {char_from_u8(u8"øre")});
+        expect_prefix_matches(char_from_u8(u8"å"), 1, {char_from_u8(u8"ås")});
+        /* Corner case: prefix length > target length means exact match */
+        expect_prefix_matches("h", 2, {"h"});
+    } else {
+        expect_prefix_matches("h", 1, {"H", "h", "HA", "ha"});
+        expect_prefix_matches(char_from_u8(u8"ø"), 1, {char_from_u8(u8"øre"), char_from_u8(u8"Ørn")});
+        expect_prefix_matches(char_from_u8(u8"å"), 1, {char_from_u8(u8"Ås"), char_from_u8(u8"ås")});
+        /* Corner case: prefix length > target length means exact match */
+        expect_prefix_matches("h", 2, {"H", "h"});
+    }
 }
 
 void
-benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm)
+benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm)
 {
     MatchStats stats;
     StringVector dummy;
     for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
         const auto& entry = dict[i];
         if (dfa_algorithm) {
-            dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy);
+            dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
         } else {
-            brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy);
+            brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
         }
     }
     std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;
 }
 
-TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary)
+using DfaFuzzyMatcherBenchmarkTest = DfaFuzzyMatcherTest;
+
+INSTANTIATE_TEST_SUITE_P(DfaFuzzyMatcherBenchmarkMultiTest,
+                         DfaFuzzyMatcherBenchmarkTest,
+                         testing::Values(TestParam("uncased", false)),
+                         testing::PrintToStringParamName());
+
+TEST_P(DfaFuzzyMatcherBenchmarkTest, benchmark_fuzzy_match_in_dictionary)
 {
     if (!benchmarking_enabled()) {
         GTEST_SKIP() << "benchmarking not enabled";
@@ -258,8 +354,9 @@ TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary)
     populate_dictionary(to_string_vector(dict));
     std::cout << "Unique words: " << store.get_num_uniques() << std::endl;
     sort_by_freq(dict);
-    benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, true);
-    benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, false);
+    bool cased = GetParam()._cased;
+    benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, cased, true);
+    benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, cased, false);
 }
 
 int
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
index f66dd56c72f..b16fdc12a9a 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -48,7 +48,8 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin
       _successor(),
       _prefix(extract_prefix(target, prefix_size, cased)),
       _successor_suffix(),
-      _prefix_size(prefix_size)
+      _prefix_size(prefix_size),
+      _cased(cased)
 {
     _successor = _prefix;
 }
@@ -60,11 +61,38 @@ DfaFuzzyMatcher::skip_prefix(const char* word) const
 {
     Utf8ReaderForZTS reader(word);
     size_t pos = 0;
-    for (; pos < _prefix_size && reader.hasMore(); ++pos) {
+    for (; pos < _prefix.size() && reader.hasMore(); ++pos) {
         (void) reader.getChar();
     }
-    assert(pos == _prefix_size);
+    assert(pos == _prefix.size());
     return reader.get_current_ptr();
 }
 
+bool
+DfaFuzzyMatcher::is_match(const char* word) const
+{
+    if (_prefix_size > 0) {
+        Utf8ReaderForZTS reader(word);
+        size_t pos = 0;
+        for (; pos < _prefix.size() && reader.hasMore(); ++pos) {
+            uint32_t code_point = reader.getChar();
+            if (!_cased) {
+                code_point = LowerCase::convert(code_point);
+            }
+            if (code_point != _prefix[pos]) {
+                break;
+            }
+        }
+        if (!reader.hasMore() && pos == _prefix.size() && pos < _prefix_size) {
+            return true;
+        }
+        if (pos != _prefix_size) {
+            return false;
+        }
+        word = reader.get_current_ptr();
+    }
+    auto match = _dfa.match(word);
+    return match.matches();
+}
+
 }
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 3d24948e044..7116b4d8662 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -5,6 +5,7 @@
 #include "dfa_string_comparator.h"
 #include <vespa/vespalib/datastore/atomic_entry_ref.h>
 #include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+#include <iostream>
 
 namespace search::attribute {
 
@@ -21,22 +22,41 @@ private:
     std::vector<uint32_t>           _prefix;
     std::vector<uint32_t>           _successor_suffix;
     uint32_t                        _prefix_size;
+    bool                            _cased;
 
-    const char*skip_prefix(const char* word) const;
+    const char* skip_prefix(const char* word) const;
 public:
     DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
     ~DfaFuzzyMatcher();
 
+    bool is_match(const char *word) const;
+
+    /*
+     * If prefix size is nonzero then this variant of is_match()
+     * should only be called with words that starts with the extracted
+     * prefix of the target word.
+     *
+     * Caller must position iterator at right location using lower bound
+     * functionality in the dictionary.
+     */
     template <typename DictionaryConstIteratorType>
     bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) {
         if (_prefix_size > 0) {
             word = skip_prefix(word);
-            auto match = _dfa.match(word, _successor_suffix);
-            if (match.matches()) {
-                return true;
+            if (_prefix.size() < _prefix_size) {
+                if (*word == '\0') {
+                    return true;
+                }
+                _successor.resize(_prefix.size());
+                _successor.emplace_back(1);
+            } else {
+                auto match = _dfa.match(word, _successor_suffix);
+                if (match.matches()) {
+                    return true;
+                }
+                _successor.resize(_prefix.size());
+                _successor.insert(_successor.end(), _successor_suffix.begin(), _successor_suffix.end());
             }
-            _successor.resize(_prefix.size());
-            _successor.insert(_successor.end(), _successor_suffix.begin(), _successor_suffix.end());
         } else {
             auto match = _dfa.match(word, _successor);
             if (match.matches()) {
author	Tor Egge <Tor.Egge@online.no>	2023-09-25 15:51:11 +0200
committer	Tor Egge <Tor.Egge@online.no>	2023-09-25 15:51:11 +0200
commit	532e823b64a036e3894241507fd49b1ab79ccead (patch)
tree	2ef7f755bee4cfab37371fa8de54555edb2144d8 /searchlib
parent	27472f94770a6644d44b765cedf802d8bb38ac03 (diff)