diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-09-18 12:31:05 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahooinc.com> | 2023-09-18 12:34:32 +0000 |
commit | e9007c181c796c90d165769fe575fbc790fa96b8 (patch) | |
tree | b1fb4846b7bf5393c46e75122ee04d6bf189b84f /searchlib/src/tests | |
parent | 5e5137cf0ab9cc67452f36b488394458f827b2b0 (diff) |
Add class that uses a LevenshteinDfa to perform fuzzy matching in a dictionary.
The dictionary iterator is advanced based on the successor string from the DFA
each time the candidate word is _not_ a match.
Diffstat (limited to 'searchlib/src/tests')
-rw-r--r-- | searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt | 9 | ||||
-rw-r--r-- | searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp | 245 |
2 files changed, 254 insertions, 0 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt new file mode 100644 index 00000000000..905e7a493da --- /dev/null +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_attribute_dfa_fuzzy_matcher_test_app TEST + SOURCES + dfa_fuzzy_matcher_test.cpp + DEPENDS + searchlib + GTest::GTest +) +vespa_add_test(NAME searchlib_attribute_dfa_fuzzy_matcher_test_app COMMAND searchlib_attribute_dfa_fuzzy_matcher_test_app) diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp new file mode 100644 index 00000000000..c7bd0e917f3 --- /dev/null +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -0,0 +1,245 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchcommon/common/dictionary_config.h> +#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h> +#include <vespa/searchlib/attribute/enumstore.h> +#include <vespa/searchlib/attribute/i_enum_store_dictionary.h> +#include <vespa/vespalib/fuzzy/fuzzy_matcher.h> +#include <vespa/vespalib/fuzzy/levenshtein_dfa.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vespalib/util/time.h> +#include <filesystem> +#include <fstream> +#include <iostream> + +namespace fs = std::filesystem; + +static std::string benchmark_dictionary; +static size_t dfa_words_to_match = 1000; +static size_t brute_force_words_to_match = 0; + +bool benchmarking_enabled() { + return !benchmark_dictionary.empty(); +} + +using namespace search::attribute; +using namespace search; +using vespalib::FuzzyMatcher; +using vespalib::datastore::AtomicEntryRef; +using vespalib::fuzzy::LevenshteinDfa; + +using StringEnumStore = EnumStoreT<const char*>; +using DictionaryEntry = std::pair<std::string, size_t>; +using RawDictionary = std::vector<DictionaryEntry>; +using StringVector = std::vector<std::string>; + +RawDictionary +read_dictionary() +{ + RawDictionary result; + std::ifstream file(benchmark_dictionary); + if (!file.is_open()) { + std::cerr << "Could not open benchmark dictionary file '" << benchmark_dictionary << "'" << std::endl; + return result; + } + std::string line; + std::string word; + size_t freq; + + /** + * Each line in the dictionary file should be on this format: + * word\tfrequency\n + * + * This is the same format used when dumping a disk index dictionary using 'vespa-index-inspect dumpwords'. + * See https://docs.vespa.ai/en/reference/vespa-cmdline-tools.html#vespa-index-inspect. + */ + while (std::getline(file, line)) { + std::istringstream iss(line); + if (std::getline(iss, word, '\t') && iss >> freq) { + result.emplace_back(word, freq); + } else { + std::cerr << "Invalid line: '" << line << "'" << std::endl; + } + } + file.close(); + return result; +} + +StringVector +to_string_vector(const RawDictionary& dict) +{ + StringVector result; + for (const auto& entry : dict) { + result.push_back(entry.first); + } + return result; +} + +void +sort_by_freq(RawDictionary& dict) +{ + std::sort(dict.begin(), dict.end(), + [](const DictionaryEntry& lhs, const DictionaryEntry& rhs) { + return lhs.second > rhs.second; + }); +} + +struct MatchStats { + size_t matches; + size_t seeks; + vespalib::duration elapsed; + size_t samples; + MatchStats() : matches(0), seeks(0), elapsed(0), samples(0) {} + void add_sample(size_t matches_in, size_t seeks_in, vespalib::duration elapsed_in) { + matches += matches_in; + seeks += seeks_in; + elapsed += elapsed_in; + ++samples; + } + double avg_matches() const { + return (double) matches / samples; + } + double avg_seeks() const { + return (double) seeks / samples; + } + double avg_elapsed_ms() const { + return (double) vespalib::count_ms(elapsed) / samples; + } +}; + +template <bool collect_matches> +void +brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words) +{ + auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); + vespalib::Timer timer; + FuzzyMatcher matcher(target, 2, 0, false); + auto itr = view.begin(); + size_t matches = 0; + size_t seeks = 0; + while (itr.valid()) { + auto word = store.get_value(itr.getKey().load_relaxed()); + if (matcher.isMatch(word)) { + ++matches; + if (collect_matches) { + matched_words.push_back(word); + } + } + ++seeks; + ++itr; + } + stats.add_sample(matches, seeks, timer.elapsed()); +} + +template <bool collect_matches> +void +dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words) +{ + auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); + vespalib::Timer timer; + DfaFuzzyMatcher matcher(target, 2, false, LevenshteinDfa::DfaType::Explicit); + auto itr = view.begin(); + size_t matches = 0; + size_t seeks = 0; + while (itr.valid()) { + auto word = store.get_value(itr.getKey().load_relaxed()); + if (matcher.is_match(word, itr, store.get_data_store())) { + ++itr; + ++matches; + if (collect_matches) { + matched_words.push_back(word); + } + } else { + ++seeks; + } + } + stats.add_sample(matches, seeks, timer.elapsed()); +} + +struct DfaFuzzyMatcherTest : public ::testing::Test { + StringEnumStore store; + DfaFuzzyMatcherTest() + : store(true, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::UNCASED)) + {} + void populate_dictionary(const StringVector& words) { + auto updater = store.make_batch_updater(); + for (const auto& word : words) { + auto ref = updater.insert(word.c_str()); + updater.inc_ref_count(ref); + } + updater.commit(); + store.freeze_dictionary(); + } + void expect_matches(std::string_view target, const StringVector& exp_matches) { + MatchStats stats; + StringVector brute_force_matches; + StringVector dfa_matches; + brute_force_fuzzy_match_in_dictionary<true>(target, store, stats, brute_force_matches); + dfa_fuzzy_match_in_dictionary<true>(target, store, stats, dfa_matches); + EXPECT_EQ(exp_matches, brute_force_matches); + EXPECT_EQ(exp_matches, dfa_matches); + } +}; + +TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) +{ + StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", + "for", "forbid", "force", "ford", "forearm", "forecast", "forest" }; + populate_dictionary(words); + expect_matches("board", {"board", "boat", "ford"}); + expect_matches("food", {"door", "food", "foot", "for", "ford"}); + expect_matches("foothill", {"football", "foothill"}); + expect_matches("for", {"bob", "door", "food", "foot", "for", "force", "ford"}); + expect_matches("force", {"for", "force", "ford"}); + expect_matches("forcecast", {"forecast"}); +} + +void +benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm) +{ + MatchStats stats; + StringVector dummy; + for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) { + const auto& entry = dict[i]; + if (dfa_algorithm) { + dfa_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy); + } else { + brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy); + } + } + std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl; +} + +TEST_F(DfaFuzzyMatcherTest, benchmark_fuzzy_match_in_dictionary) +{ + if (!benchmarking_enabled()) { + GTEST_SKIP() << "benchmarking not enabled"; + } + auto dict = read_dictionary(); + populate_dictionary(to_string_vector(dict)); + std::cout << "Unique words: " << store.get_num_uniques() << std::endl; + sort_by_freq(dict); + benchmark_fuzzy_match_in_dictionary(store, dict, dfa_words_to_match, true); + benchmark_fuzzy_match_in_dictionary(store, dict, brute_force_words_to_match, false); +} + +int +main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + if (argc > 1) { + benchmark_dictionary = argv[1]; + if (!fs::exists(fs::path(benchmark_dictionary))) { + std::cerr << "Benchmark dictionary file '" << benchmark_dictionary << "' does not exist" << std::endl; + return 1; + } + if (argc > 2) { + dfa_words_to_match = std::stoi(argv[2]); + } + if (argc > 3) { + brute_force_words_to_match = std::stoi(argv[3]); + } + } + return RUN_ALL_TESTS(); +} + |