summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@yahooinc.com>2023-09-18 15:11:30 +0200
committerGitHub <noreply@github.com>2023-09-18 15:11:30 +0200
commitd6fc10119eb59a0a851adab6c493e3a5bfbd3065 (patch)
tree326d12e6767fb4addfb00f07eebb2b485a6db566
parentebbb044d0830c38d9ceb98425c5b666c7627d42b (diff)
parente9007c181c796c90d165769fe575fbc790fa96b8 (diff)
Merge pull request #28557 from vespa-engine/geirst/dfa_fuzzy_matcher
Class that uses a LevenshteinDfa to perform fuzzy matching in a dictionary
-rw-r--r--searchlib/CMakeLists.txt1
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt9
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp245
-rw-r--r--searchlib/src/vespa/searchlib/attribute/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp17
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h39
6 files changed, 312 insertions, 0 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index b83c5912680..f7cd8512f07 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -75,6 +75,7 @@ vespa_define_module(
src/tests/attribute/bitvector_search_cache
src/tests/attribute/changevector
src/tests/attribute/compaction
+ src/tests/attribute/dfa_fuzzy_matcher
src/tests/attribute/document_weight_iterator
src/tests/attribute/document_weight_or_filter_search
src/tests/attribute/enum_attribute_compaction
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt
new file mode 100644
index 00000000000..905e7a493da
--- /dev/null
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_attribute_dfa_fuzzy_matcher_test_app TEST
+ SOURCES
+ dfa_fuzzy_matcher_test.cpp
+ DEPENDS
+ searchlib
+ GTest::GTest
+)
+vespa_add_test(NAME searchlib_attribute_dfa_fuzzy_matcher_test_app COMMAND searchlib_attribute_dfa_fuzzy_matcher_test_app)
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
new file mode 100644
index 00000000000..c7bd0e917f3
--- /dev/null
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -0,0 +1,245 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchcommon/common/dictionary_config.h>
+#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h>
+#include <vespa/searchlib/attribute/enumstore.h>
+#include <vespa/searchlib/attribute/i_enum_store_dictionary.h>
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/util/time.h>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+
+namespace fs = std::filesystem;
+
+static std::string benchmark_dictionary;
+static size_t dfa_words_to_match = 1000;
+static size_t brute_force_words_to_match = 0;
+
+bool benchmarking_enabled() {
+ return !benchmark_dictionary.empty();
+}
+
+using namespace search::attribute;
+using namespace search;
+using vespalib::FuzzyMatcher;
+using vespalib::datastore::AtomicEntryRef;
+using vespalib::fuzzy::LevenshteinDfa;
+
+using StringEnumStore = EnumStoreT<const char*>;
+using DictionaryEntry = std::pair<std::string, size_t>;
+using RawDictionary = std::vector<DictionaryEntry>;
+using StringVector = std::vector<std::string>;
+
+RawDictionary
+read_dictionary()
+{
+ RawDictionary result;
+ std::ifstream file(benchmark_dictionary);
+ if (!file.is_open()) {
+ std::cerr << "Could not open benchmark dictionary file '" << benchmark_dictionary << "'" << std::endl;
+ return result;
+ }
+ std::string line;
+ std::string word;
+ size_t freq;
+
+ /**
+ * Each line in the dictionary file should be on this format:
+ * word\tfrequency\n
+ *
+ * This is the same format used when dumping a disk index dictionary using 'vespa-index-inspect dumpwords'.
+ * See https://docs.vespa.ai/en/reference/vespa-cmdline-tools.html#vespa-index-inspect.
+ */
+ while (std::getline(file, line)) {
+ std::istringstream iss(line);
+ if (std::getline(iss, word, '\t') && iss >> freq) {
+ result.emplace_back(word, freq);
+ } else {
+ std::cerr << "Invalid line: '" << line << "'" << std::endl;
+ }
+ }
+ file.close();
+ return result;
+}
+
+StringVector
+to_string_vector(const RawDictionary& dict)
+{
+ StringVector result;
+ for (const auto& entry : dict) {
+ result.push_back(entry.first);
+ }
+ return result;
+}
+
+void
+sort_by_freq(RawDictionary& dict)
+{
+ std::sort(dict.begin(), dict.end(),
+ [](const DictionaryEntry& lhs, const DictionaryEntry& rhs) {
+ return lhs.second > rhs.second;
+ });
+}
+
+struct MatchStats {
+ size_t matches;
+ size_t seeks;
+ vespalib::duration elapsed;
+ size_t samples;
+ MatchStats() : matches(0), seeks(0), elapsed(0), samples(0) {}
+ void add_sample(size_t matches_in, size_t seeks_in, vespalib::duration elapsed_in) {
+ matches += matches_in;
+ seeks += seeks_in;
+ elapsed += elapsed_in;
+ ++samples;
+ }
+ double avg_matches() const {
+ return (double) matches / samples;
+ }
+ double avg_seeks() const {
+ return (double) seeks / samples;
+ }
+ double avg_elapsed_ms() const {
+ return (double) vespalib::count_ms(elapsed) / samples;
+ }
+};
+
+template <bool collect_matches>
+void
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words)
+{
+ auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
+ vespalib::Timer timer;
+ FuzzyMatcher matcher(target, 2, 0, false);
+ auto itr = view.begin();
+ size_t matches = 0;
+ size_t seeks = 0;
+ while (itr.valid()) {
+ auto word = store.get_value(itr.getKey().load_relaxed());
+ if (matcher.isMatch(word)) {
+ ++matches;
+ if (collect_matches) {
+ matched_words.push_back(word);
+ }
+ }
+ ++seeks;
+ ++itr;
+ }
+ stats.add_sample(matches, seeks, timer.elapsed());
+}
+
+template <bool collect_matches>
+void
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words)
+{
+ auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
+ vespalib::Timer timer;
+ DfaFuzzyMatcher matcher(target, 2, false, LevenshteinDfa::DfaType::Explicit);
+ auto itr = view.begin();
+ size_t matches = 0;
+ size_t seeks = 0;
+ while (itr.valid()) {
+ auto word = store.get_value(itr.getKey().load_relaxed());
+ if (matcher.is_match(word, itr, store.get_data_store())) {
+ ++itr;
+ ++matches;
+ if (collect_matches) {
+ matched_words.push_back(word);
+ }
+ } else {
+ ++seeks;
+ }
+ }
+ stats.add_sample(matches, seeks, timer.elapsed());
+}
+
+struct DfaFuzzyMatcherTest : public ::testing::Test {
+ StringEnumStore store;
+ DfaFuzzyMatcherTest()
+ : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::UNCASED))
+ {}
+ void populate_dictionary(const StringVector& words) {
+ auto updater = store.make_batch_updater();
+ for (const auto& word : words) {
+ auto ref = updater.insert(word.c_str());
+ updater.inc_ref_count(ref);
+ }
+ updater.commit();
+ store.freeze_dictionary();
+ }
+ void expect_matches(std::string_view target, const StringVector& exp_matches) {
+ MatchStats stats;
+ StringVector brute_force_matches;
+ StringVector dfa_matches;
+ brute_force_fuzzy_match_in_dictionary<true>(target, store, stats, brute_force_matches);
+ dfa_fuzzy_match_in_dictionary<true>(target, store, stats, dfa_matches);
+ EXPECT_EQ(exp_matches, brute_force_matches);
+ EXPECT_EQ(exp_matches, dfa_matches);
+ }
+};
+
+TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
+{
+ StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest" };
+ populate_dictionary(words);
+ expect_matches("board", {"board", "boat", "ford"});
+ expect_matches("food", {"door", "food", "foot", "for", "ford"});
+ expect_matches("foothill", {"football", "foothill"});
+ expect_matches("for", {"bob", "door", "food", "foot", "for", "force", "ford"});
+ expect_matches("force", {"for", "force", "ford"});
+ expect_matches("forcecast", {"forecast"});
+}
+
+void
+benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm)
+{
+ MatchStats stats;
+ StringVector dummy;
+ for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
+ const auto& entry = dict[i];
+ if (dfa_algorithm) {
+ dfa_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy);
+ } else {
+ brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy);
+ }
+ }
+ std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;
+}
+
+TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary)
+{
+ if (!benchmarking_enabled()) {
+ GTEST_SKIP() << "benchmarking not enabled";
+ }
+ auto dict = read_dictionary();
+ populate_dictionary(to_string_vector(dict));
+ std::cout << "Unique words: " << store.get_num_uniques() << std::endl;
+ sort_by_freq(dict);
+ benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, true);
+ benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, false);
+}
+
+int
+main(int argc, char** argv)
+{
+ ::testing::InitGoogleTest(&argc, argv);
+ if (argc > 1) {
+ benchmark_dictionary = argv[1];
+ if (!fs::exists(fs::path(benchmark_dictionary))) {
+ std::cerr << "Benchmark dictionary file '" << benchmark_dictionary << "' does not exist" << std::endl;
+ return 1;
+ }
+ if (argc > 2) {
+ dfa_words_to_match = std::stoi(argv[2]);
+ }
+ if (argc > 3) {
+ brute_force_words_to_match = std::stoi(argv[3]);
+ }
+ }
+ return RUN_ALL_TESTS();
+}
+
diff --git a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
index 26f714258eb..58d3ec0298a 100644
--- a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
@@ -36,6 +36,7 @@ vespa_add_library(searchlib_attribute OBJECT
createsinglefastsearch.cpp
createsinglestd.cpp
defines.cpp
+ dfa_fuzzy_matcher.cpp
dfa_string_comparator.cpp
distance_metric_utils.cpp
diversity.cpp
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
new file mode 100644
index 00000000000..580c34bd5d0
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -0,0 +1,17 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "dfa_fuzzy_matcher.h"
+
+using vespalib::fuzzy::LevenshteinDfa;
+
+namespace search::attribute {
+
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, LevenshteinDfa::DfaType dfa_type)
+ : _dfa(vespalib::fuzzy::LevenshteinDfa::build(target, max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)),
+ _successor()
+{
+}
+
+DfaFuzzyMatcher::~DfaFuzzyMatcher() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
new file mode 100644
index 00000000000..072eb5e1333
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "dfa_string_comparator.h"
+#include <vespa/vespalib/datastore/atomic_entry_ref.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
+
+namespace search::attribute {
+
+/**
+ * Class that uses a LevenshteinDfa to fuzzy match a target word against words in a dictionary.
+ *
+ * The dictionary iterator is advanced based on the successor string from the DFA
+ * each time the candidate word is _not_ a match.
+ */
+class DfaFuzzyMatcher {
+private:
+ vespalib::fuzzy::LevenshteinDfa _dfa;
+ std::string _successor;
+
+public:
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+ ~DfaFuzzyMatcher();
+
+ template <typename DictionaryConstIteratorType>
+ bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) {
+ auto match = _dfa.match(word, &_successor);
+ if (match.matches()) {
+ return true;
+ } else {
+ DfaStringComparator cmp(data_store, _successor.c_str());
+ itr.seek(vespalib::datastore::AtomicEntryRef(), cmp);
+ return false;
+ }
+ }
+};
+
+}