summaryrefslogtreecommitdiffstats
path: root/searchlib/src/tests/attribute/dfa_fuzzy_matcher
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-04-19 11:19:18 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-04-19 13:45:59 +0000
commit0afbf14df1ee158167f70016545e799af1e433dc (patch)
tree5af98617f61cb76fcfe897585c9c2712955de3b9 /searchlib/src/tests/attribute/dfa_fuzzy_matcher
parent433cb01e19f6bb51d6a2d029482a6e16431cb055 (diff)
Wire fuzzy prefix matching support through the query stack
Adds `prefix:[true|false]` annotation support to the `fuzzy` query operator in the YQL and JSON query languages. Fuzzy prefix matching semantics are wired through to the matcher implementations for both indexed and streaming search. Example usage: {maxEditDistance:1,prefix:true}fuzzy("foo") Will match `foo`, `foobar`, `foxtrot`, `zookeeper` and so on. It can be combined with the existing prefix locking feature: {maxEditDistance:1,prefixLength:2,prefix:true}fuzzy("foo") Which will match `foo`, `foobar`, `foxtrot` etc, but _not_ `zookeeper` since the locked prefix (`fo`) does not match. Due to the complexities involved with extending the legacy binary query stack representation, signalling prefix matching for the fuzzy term is done by pragmatically adding a new, generic "prefix matching" term-level flag. This is currently ignored for everything except fuzzy query items. Modernizing the query stack format to make it more extensible (i.e. move encoding to Protobuf) is on the backlog...!
Diffstat (limited to 'searchlib/src/tests/attribute/dfa_fuzzy_matcher')
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp54
1 files changed, 40 insertions, 14 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index 8ba8c62c5ff..55810c02550 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -120,11 +120,12 @@ struct MatchStats {
template <bool collect_matches>
void
-brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- FuzzyMatcher matcher(target, 2, prefix_size, cased);
+ FuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -144,11 +145,12 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit);
Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
std::string target_copy;
Utf8Writer<std::string> writer(target_copy);
@@ -187,11 +189,12 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -247,20 +250,23 @@ struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> {
updater.commit();
store.freeze_dictionary();
}
- void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, bool prefix_match, const StringVector& exp_matches) {
MatchStats stats;
StringVector brute_force_matches;
StringVector dfa_matches;
StringVector dfa_no_skip_matches;
bool cased = GetParam()._cased;
SCOPED_TRACE(target);
- brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches);
- dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches);
- dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches);
+ brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, brute_force_matches);
+ dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_matches);
+ dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_no_skip_matches);
EXPECT_EQ(exp_matches, brute_force_matches);
EXPECT_EQ(exp_matches, dfa_matches);
EXPECT_EQ(exp_matches, dfa_no_skip_matches);
}
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
+ expect_prefix_matches(target, prefix_size, false, exp_matches);
+ }
void expect_matches(std::string_view target, const StringVector& exp_matches) {
expect_prefix_matches(target, 0, exp_matches);
}
@@ -284,11 +290,12 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
expect_matches("forcecast", {"forecast"});
}
-TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_lock_length)
{
bool cased = GetParam()._cased;
StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
- "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha",
+ char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
populate_dictionary(words);
expect_prefix_matches("a", 1, {});
expect_prefix_matches("b", 1, {"bob"});
@@ -321,6 +328,25 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
}
}
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_matching) {
+ // We include the empty string to make "everything matches"-checks easier since
+ // the dictionary implicitly returns such an empty string sentinel already.
+ StringVector words = {"", "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest"};
+ populate_dictionary(words);
+ expect_prefix_matches("bo", 0, true, words); // matches everything
+ expect_prefix_matches("bo", 2, true, {"board", "boat", "bob"});
+ expect_prefix_matches("boar", 0, true, {"board", "boat", "bob", "door", "for", "forbid",
+ "force", "ford", "forearm", "forecast", "forest"});
+ expect_prefix_matches("board", 0, true, {"board", "boat", "ford"});
+ expect_prefix_matches("footb", 0, true, {"food", "foot", "football", "foothill", "forbid"});
+ expect_prefix_matches("footba", 0, true, {"foot", "football", "foothill"});
+ expect_prefix_matches("footbal", 0, true, {"football", "foothill"});
+ expect_prefix_matches("football", 0, true, {"football", "foothill"});
+ expect_prefix_matches("footballs", 0, true, {"football"});
+ expect_prefix_matches("z", 1, true, {});
+}
+
void
benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm)
{
@@ -329,9 +355,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti
for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
const auto& entry = dict[i];
if (dfa_algorithm) {
- dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
+ dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy);
} else {
- brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
+ brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy);
}
}
std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;