aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2023-09-25 11:35:44 +0200
committerGitHub <noreply@github.com>2023-09-25 11:35:44 +0200
commit840fd17d00103a880a8d87e82b9f28bc228d1336 (patch)
tree8ddb62facbebc83e6fac274fcb0722323d8f1319 /searchlib/src
parent4d5c9f2d3033a4e51b916dcde569f8f0451bcee0 (diff)
parent60eb4d43fce41b252f9e16cf951a009f1d64f0ed (diff)
Merge pull request #28624 from vespa-engine/toregge/add-prefix-size-constructor-argument-to-dfa-fuzzy-matcher
Add prefix_size constructor argument to DfaFuzzyMatcher.
Diffstat (limited to 'searchlib/src')
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp61
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp59
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h29
3 files changed, 127 insertions, 22 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index c7bd0e917f3..77e23a58163 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -26,6 +26,7 @@ using namespace search::attribute;
using namespace search;
using vespalib::FuzzyMatcher;
using vespalib::datastore::AtomicEntryRef;
+using vespalib::datastore::EntryRef;
using vespalib::fuzzy::LevenshteinDfa;
using StringEnumStore = EnumStoreT<const char*>;
@@ -109,11 +110,11 @@ struct MatchStats {
template <bool collect_matches>
void
-brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words)
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- FuzzyMatcher matcher(target, 2, 0, false);
+ FuzzyMatcher matcher(target, 2, prefix_size, false);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -133,15 +134,27 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, false, LevenshteinDfa::DfaType::Explicit);
- auto itr = view.begin();
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, false, LevenshteinDfa::DfaType::Explicit);
+ std::string target_copy(target.substr(0, prefix_size));
+ auto prefix_cmp = store.make_folded_comparator_prefix(target_copy.c_str());
+ auto itr = prefix_size > 0 ? view.lowerBound(AtomicEntryRef(), prefix_cmp) : view.begin();
+ auto itr_end = itr;
+ if (itr_end.valid()) {
+ if (prefix_size > 0) {
+ if (!prefix_cmp.less(EntryRef(), itr_end.getKey().load_relaxed())) {
+ itr_end.seekPast(AtomicEntryRef(), prefix_cmp);
+ }
+ } else {
+ itr_end.end();
+ }
+ }
size_t matches = 0;
size_t seeks = 0;
- while (itr.valid()) {
+ while (itr != itr_end) {
auto word = store.get_value(itr.getKey().load_relaxed());
if (matcher.is_match(word, itr, store.get_data_store())) {
++itr;
@@ -170,15 +183,19 @@ struct DfaFuzzyMatcherTest : public ::testing::Test {
updater.commit();
store.freeze_dictionary();
}
- void expect_matches(std::string_view target, const StringVector& exp_matches) {
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
MatchStats stats;
StringVector brute_force_matches;
StringVector dfa_matches;
- brute_force_fuzzy_match_in_dictionary<true>(target, store, stats, brute_force_matches);
- dfa_fuzzy_match_in_dictionary<true>(target, store, stats, dfa_matches);
+ SCOPED_TRACE(target);
+ brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, brute_force_matches);
+ dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, stats, dfa_matches);
EXPECT_EQ(exp_matches, brute_force_matches);
EXPECT_EQ(exp_matches, dfa_matches);
}
+ void expect_matches(std::string_view target, const StringVector& exp_matches) {
+ expect_prefix_matches(target, 0, exp_matches);
+ }
};
TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
@@ -194,6 +211,28 @@ TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
expect_matches("forcecast", {"forecast"});
}
+TEST_F(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
+{
+ StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest" };
+ populate_dictionary(words);
+ expect_prefix_matches("a", 1, {});
+ expect_prefix_matches("b", 1, {"bob"});
+ expect_prefix_matches("board", 1, {"board", "boat"});
+ expect_prefix_matches("c", 1, {});
+ expect_prefix_matches("food", 1, {"food", "foot", "for", "ford"});
+ expect_prefix_matches("food", 2, {"food", "foot", "for", "ford"});
+ expect_prefix_matches("food", 3, {"food", "foot"});
+ expect_prefix_matches("foothill", 1, {"football", "foothill"});
+ expect_prefix_matches("for", 1, {"food", "foot", "for", "force", "ford"});
+ expect_prefix_matches("for", 2, {"food", "foot", "for", "force", "ford"});
+ expect_prefix_matches("for", 3, {"for", "force", "ford"});
+ expect_prefix_matches("force", 1, {"for", "force", "ford"});
+ expect_prefix_matches("forcecast", 1, {"forecast"});
+ expect_prefix_matches("forcecast", 4, {});
+ expect_prefix_matches("z", 1, {});
+}
+
void
benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool dfa_algorithm)
{
@@ -202,9 +241,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti
for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
const auto& entry = dict[i];
if (dfa_algorithm) {
- dfa_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy);
+ dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy);
} else {
- brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, stats, dummy);
+ brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, stats, dummy);
}
}
std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
index 580c34bd5d0..f66dd56c72f 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -1,17 +1,70 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "dfa_fuzzy_matcher.h"
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/text/lowercase.h>
using vespalib::fuzzy::LevenshteinDfa;
+using vespalib::LowerCase;
+using vespalib::Utf8Reader;
+using vespalib::Utf8ReaderForZTS;
namespace search::attribute {
-DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, LevenshteinDfa::DfaType dfa_type)
- : _dfa(vespalib::fuzzy::LevenshteinDfa::build(target, max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)),
- _successor()
+namespace {
+
+std::vector<uint32_t>
+extract_prefix(std::string_view target, uint32_t prefix_size, bool cased)
+{
+ std::vector<uint32_t> result;
+ result.reserve(prefix_size);
+ Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
+ for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) {
+ uint32_t code_point = reader.getChar();
+ if (!cased) {
+ code_point = LowerCase::convert(code_point);
+ }
+ result.emplace_back(code_point);
+ }
+ return result;
+}
+
+std::string_view
+extract_suffix(std::string_view target, uint32_t prefix_size)
+{
+ Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
+ for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) {
+ (void) reader.getChar();
+ }
+ std::string_view result = target;
+ result.remove_prefix(reader.getPos());
+ return result;
+}
+
+}
+
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type)
+ : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)),
+ _successor(),
+ _prefix(extract_prefix(target, prefix_size, cased)),
+ _successor_suffix(),
+ _prefix_size(prefix_size)
{
+ _successor = _prefix;
}
DfaFuzzyMatcher::~DfaFuzzyMatcher() = default;
+const char*
+DfaFuzzyMatcher::skip_prefix(const char* word) const
+{
+ Utf8ReaderForZTS reader(word);
+ size_t pos = 0;
+ for (; pos < _prefix_size && reader.hasMore(); ++pos) {
+ (void) reader.getChar();
+ }
+ assert(pos == _prefix_size);
+ return reader.get_current_ptr();
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index fcba13f85a4..3d24948e044 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -17,22 +17,35 @@ namespace search::attribute {
class DfaFuzzyMatcher {
private:
vespalib::fuzzy::LevenshteinDfa _dfa;
- std::vector<uint32_t> _successor;
+ std::vector<uint32_t> _successor;
+ std::vector<uint32_t> _prefix;
+ std::vector<uint32_t> _successor_suffix;
+ uint32_t _prefix_size;
+ const char*skip_prefix(const char* word) const;
public:
- DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
~DfaFuzzyMatcher();
template <typename DictionaryConstIteratorType>
bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) {
- auto match = _dfa.match(word, _successor);
- if (match.matches()) {
- return true;
+ if (_prefix_size > 0) {
+ word = skip_prefix(word);
+ auto match = _dfa.match(word, _successor_suffix);
+ if (match.matches()) {
+ return true;
+ }
+ _successor.resize(_prefix.size());
+ _successor.insert(_successor.end(), _successor_suffix.begin(), _successor_suffix.end());
} else {
- DfaStringComparator cmp(data_store, _successor);
- itr.seek(vespalib::datastore::AtomicEntryRef(), cmp);
- return false;
+ auto match = _dfa.match(word, _successor);
+ if (match.matches()) {
+ return true;
+ }
}
+ DfaStringComparator cmp(data_store, _successor);
+ itr.seek(vespalib::datastore::AtomicEntryRef(), cmp);
+ return false;
}
};