Merge pull request #28557 from vespa-engine/geirst/dfa_fuzzy_matcher

Class that uses a LevenshteinDfa to perform fuzzy matching in a dictionary
author: Tor Egge <Tor.Egge@yahooinc.com> 2023-09-18 15:11:30 +0200
committer: GitHub <noreply@github.com> 2023-09-18 15:11:30 +0200
commit: d6fc10119eb59a0a851adab6c493e3a5bfbd3065 (patch)
tree: 326d12e6767fb4addfb00f07eebb2b485a6db566
parent: ebbb044d0830c38d9ceb98425c5b666c7627d42b (diff)
parent: e9007c181c796c90d165769fe575fbc790fa96b8 (diff)
6 files changed, 312 insertions, 0 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index b83c5912680..f7cd8512f07 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -75,6 +75,7 @@ vespa_define_module(
     src/tests/attribute/bitvector_search_cache
     src/tests/attribute/changevector
     src/tests/attribute/compaction
+    src/tests/attribute/dfa_fuzzy_matcher
     src/tests/attribute/document_weight_iterator
     src/tests/attribute/document_weight_or_filter_search
     src/tests/attribute/enum_attribute_compaction
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt
new file mode 100644
index 00000000000..905e7a493da
--- /dev/null
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_attribute_dfa_fuzzy_matcher_test_app TEST
+    SOURCES
+    dfa_fuzzy_matcher_test.cpp
+    DEPENDS
+    searchlib
+    GTest::GTest
+)
+vespa_add_test(NAME searchlib_attribute_dfa_fuzzy_matcher_test_app COMMAND searchlib_attribute_dfa_fuzzy_matcher_test_app)
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
new file mode 100644
index 00000000000..c7bd0e917f3
--- /dev/null
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -0,0 +1,245 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchcommon/common/dictionary_config.h>
+#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h>
+#include <vespa/searchlib/attribute/enumstore.h>
+#include <vespa/searchlib/attribute/i_enum_store_dictionary.h>
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/util/time.h>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+
+namespace fs = std::filesystem;
+
+static std::string benchmark_dictionary;
+static size_t dfa_words_to_match = 1000;
+static size_t brute_force_words_to_match = 0;
+
+bool benchmarking_enabled() {
+    return !benchmark_dictionary.empty();
+}
+
+using namespace search::attribute;
+using namespace search;
+using vespalib::FuzzyMatcher;
+using vespalib::datastore::AtomicEntryRef;
+using vespalib::fuzzy::LevenshteinDfa;
+
+using StringEnumStore = EnumStoreT<const char*>;
+using DictionaryEntry = std::pair<std::string, size_t>;
+using RawDictionary = std::vector<DictionaryEntry>;
+using StringVector = std::vector<std::string>;
+
+RawDictionary
+read_dictionary()
+{
+    RawDictionary result;
+    std::ifstream file(benchmark_dictionary);
+    if (!file.is_open()) {
+        std::cerr << "Could not open benchmark dictionary file '" << benchmark_dictionary << "'" << std::endl;
+        return result;
+    }
+    std::string line;
+    std::string word;
+    size_t freq;
+
+    /**
+     * Each line in the dictionary file should be on this format:
+     * word\tfrequency\n
+     *
+     * This is the same format used when dumping a disk index dictionary using 'vespa-index-inspect dumpwords'.
+     * See https://docs.vespa.ai/en/reference/vespa-cmdline-tools.html#vespa-index-inspect.
+     */
+    while (std::getline(file, line)) {
+        std::istringstream iss(line);
+        if (std::getline(iss, word, '\t') && iss >> freq) {
+            result.emplace_back(word, freq);
+        } else {
+            std::cerr << "Invalid line: '" << line << "'" << std::endl;
+        }
+    }
+    file.close();
+    return result;
+}
+
+StringVector
+to_string_vector(const RawDictionary& dict)
+{
+    StringVector result;
+    for (const auto& entry : dict) {
+        result.push_back(entry.first);
+    }
+    return result;
+}
+
+void
+sort_by_freq(RawDictionary& dict)
+{
+    std::sort(dict.begin(), dict.end(),
+              [](const DictionaryEntry& lhs, const DictionaryEntry& rhs) {
+        return lhs.second > rhs.second;
+    });
+}
+
+struct MatchStats {
+    size_t matches;
+    size_t seeks;
+    vespalib::duration elapsed;
+    size_t samples;
+    MatchStats() : matches(0), seeks(0), elapsed(0), samples(0) {}
+    void add_sample(size_t matches_in, size_t seeks_in, vespalib::duration elapsed_in) {
+        matches += matches_in;
+        seeks += seeks_in;
+        elapsed += elapsed_in;
+        ++samples;
+    }
+    double avg_matches() const {
+        return (double) matches / samples;
+    }
+    double avg_seeks() const {
+        return (double) seeks / samples;
+    }
+    double avg_elapsed_ms() const {
+        return (double) vespalib::count_ms(elapsed) / samples;
+    }
+};
+
+template <bool collect_matches>
+void
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words)
+{
+    auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
+    vespalib::Timer timer;
+    FuzzyMatcher matcher(target, 2, 0, false);
+    auto itr = view.begin();
+    size_t matches = 0;
+    size_t seeks = 0;
+    while (itr.valid()) {
+        auto word = store.get_value(itr.getKey().load_relaxed());
+        if (matcher.isMatch(word)) {
+            ++matches;
+            if (collect_matches) {
+                matched_words.push_back(word);
+            }
+        }
+        ++seeks;
+        ++itr;
+    }
+    stats.add_sample(matches, seeks, timer.elapsed());
+}
+
+template <bool collect_matches>
+void
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words)
+{
+    auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
+    vespalib::Timer timer;
+    DfaFuzzyMatcher matcher(target, 2, false, LevenshteinDfa::DfaType::Explicit);
+    auto itr = view.begin();
+    size_t matches = 0;
+    size_t seeks = 0;
+    while (itr.valid()) {
+        auto word = store.get_value(itr.getKey().load_relaxed());
+        if (matcher.is_match(word, itr, store.get_data_store())) {
+            ++itr;
+            ++matches;
+            if (collect_matches) {
+                matched_words.push_back(word);
+            }
+        } else {
+            ++seeks;
+        }
+    }
+    stats.add_sample(matches, seeks, timer.elapsed());
+}
+
+struct DfaFuzzyMatcherTest : public ::testing::Test {
+    StringEnumStore store;
+    DfaFuzzyMatcherTest()
+        : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::UNCASED))
+    {}
+    void populate_dictionary(const StringVector& words) {
+        auto updater = store.make_batch_updater();
+        for (const auto& word : words) {
+            auto ref = updater.insert(word.c_str());
+            updater.inc_ref_count(ref);
+        }
+        updater.commit();
+        store.freeze_dictionary();
+    }
+    void expect_matches(std::string_view target, const StringVector& exp_matches) {
+        MatchStats stats;
+        StringVector brute_force_matches;
+        StringVector dfa_matches;
+        brute_force_fuzzy_match_in_dictionary<true>(target, store, stats, brute_force_matches);
+        dfa_fuzzy_match_in_dictionary<true>(target, store, stats, dfa_matches);
+        EXPECT_EQ(exp_matches, brute_force_matches);
+        EXPECT_EQ(exp_matches, dfa_matches);
+    }
+};
+
+TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
+{
+    StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
+                           "for", "forbid", "force", "ford", "forearm", "forecast", "forest" };
+    populate_dictionary(words);
+    expect_matches("board", {"board", "boat", "ford"});
+    expect_matches("food", {"door", "food", "foot", "for", "ford"});
+    expect_matches("foothill", {"football", "foothill"});
+    expect_matches("for", {"bob", "door", "food", "foot", "for", "force", "ford"});
+    expect_matches("force", {"for", "force", "ford"});
+    expect_matches("forcecast", {"forecast"});
+}
+
+void
+benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm)
+{
+    MatchStats stats;
+    StringVector dummy;
+    for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
+        const auto& entry = dict[i];
+        if (dfa_algorithm) {
+            dfa_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy);
+        } else {
+            brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy);
+        }
+    }
+    std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;
+}
+
+TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary)
+{
+    if (!benchmarking_enabled()) {
+        GTEST_SKIP() << "benchmarking not enabled";
+    }
+    auto dict = read_dictionary();
+    populate_dictionary(to_string_vector(dict));
+    std::cout << "Unique words: " << store.get_num_uniques() << std::endl;
+    sort_by_freq(dict);
+    benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, true);
+    benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, false);
+}
+
+int
+main(int argc, char** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);
+    if (argc > 1) {
+        benchmark_dictionary = argv[1];
+        if (!fs::exists(fs::path(benchmark_dictionary))) {
+            std::cerr << "Benchmark dictionary file '" << benchmark_dictionary << "' does not exist" << std::endl;
+            return 1;
+        }
+        if (argc > 2) {
+            dfa_words_to_match = std::stoi(argv[2]);
+        }
+        if (argc > 3) {
+            brute_force_words_to_match = std::stoi(argv[3]);
+        }
+    }
+    return RUN_ALL_TESTS();
+}
+
diff --git a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
index 26f714258eb..58d3ec0298a 100644
--- a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
@@ -36,6 +36,7 @@ vespa_add_library(searchlib_attribute OBJECT
     createsinglefastsearch.cpp
     createsinglestd.cpp
     defines.cpp
+    dfa_fuzzy_matcher.cpp
     dfa_string_comparator.cpp
     distance_metric_utils.cpp
     diversity.cpp
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
new file mode 100644
index 00000000000..580c34bd5d0
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -0,0 +1,17 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "dfa_fuzzy_matcher.h"
+
+using vespalib::fuzzy::LevenshteinDfa;
+
+namespace search::attribute {
+
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, LevenshteinDfa::DfaType dfa_type)
+    : _dfa(vespalib::fuzzy::LevenshteinDfa::build(target, max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)),
+      _successor()
+{
+}
+
+DfaFuzzyMatcher::~DfaFuzzyMatcher() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
new file mode 100644
index 00000000000..072eb5e1333
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "dfa_string_comparator.h"
+#include <vespa/vespalib/datastore/atomic_entry_ref.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+
+namespace search::attribute {
+
+/**
+ * Class that uses a LevenshteinDfa to fuzzy match a target word against words in a dictionary.
+ *
+ * The dictionary iterator is advanced based on the successor string from the DFA
+ * each time the candidate word is _not_ a match.
+ */
+class DfaFuzzyMatcher {
+private:
+    vespalib::fuzzy::LevenshteinDfa _dfa;
+    std::string _successor;
+
+public:
+    DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+    ~DfaFuzzyMatcher();
+
+    template <typename DictionaryConstIteratorType>
+    bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) {
+        auto match = _dfa.match(word, &_successor);
+        if (match.matches()) {
+            return true;
+        } else {
+            DfaStringComparator cmp(data_store, _successor.c_str());
+            itr.seek(vespalib::datastore::AtomicEntryRef(), cmp);
+            return false;
+        }
+    }
+};
+
+}
author	Tor Egge <Tor.Egge@yahooinc.com>	2023-09-18 15:11:30 +0200
committer	GitHub <noreply@github.com>	2023-09-18 15:11:30 +0200
commit	d6fc10119eb59a0a851adab6c493e3a5bfbd3065 (patch)
tree	326d12e6767fb4addfb00f07eebb2b485a6db566
parent	ebbb044d0830c38d9ceb98425c5b666c7627d42b (diff)
parent	e9007c181c796c90d165769fe575fbc790fa96b8 (diff)