diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-04-11 11:26:02 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-11 11:26:02 +0200 |
commit | 65dc21685f2286a30c82c7d14c9fe5fe5c42d412 (patch) | |
tree | c28d2a5dc1bff4dab1051c163042b84899d2bb2c /vespalib | |
parent | 23841f2517967c1a59cf9826f1de953c5caa7199 (diff) | |
parent | 16aaf73dc37c63fa92a3298e6c9f8fa6ed32422a (diff) |
Merge pull request #21972 from alexeyche/alexeyche/fuzzy-query-annotations
Propagating annotations for fuzzy query [WIP]
Diffstat (limited to 'vespalib')
5 files changed, 104 insertions, 32 deletions
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp index 60a4eab3f57..1395915fa5f 100644 --- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -6,12 +6,34 @@ using namespace vespalib; -FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) { - return {LowerCase::convert_to_ucs4(term), threshold, prefix_size}; + +template <typename T> +void assert_span(std::span<const T> left, std::vector<T> right) { + EXPECT_TRUE(std::equal(left.begin(), left.end(), right.begin(), right.end())); +} + +TEST(FuzzyMatcherTest, get_prefix_edge_cases) { + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 0), {}); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 1), {1 }); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 2), {1, 2}); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 3), {1, 2, 3}); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 10),{1, 2, 3}); + assert_span(FuzzyMatcher::get_prefix({}, 0), {}); + assert_span(FuzzyMatcher::get_prefix({}, 10), {}); +} + +TEST(FuzzyMatcherTest, get_suffix_edge_cases) { + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 0), {1, 2, 3}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 1), {2, 3}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 2), {3}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 3), {}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 10), {}); + assert_span(FuzzyMatcher::get_suffix({}, 0), {}); + assert_span(FuzzyMatcher::get_suffix({}, 10),{}); } TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { - FuzzyMatcher fuzzy = from_term("abc", 2, 0); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("ABC")); EXPECT_TRUE(fuzzy.isMatch("ab1")); @@ -20,22 +42,30 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { } TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { - FuzzyMatcher fuzzy = from_term("abcdef", 2, 2); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2); EXPECT_TRUE(fuzzy.isMatch("abcdef")); EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); EXPECT_TRUE(fuzzy.isMatch("abcde1")); EXPECT_TRUE(fuzzy.isMatch("abcd12")); EXPECT_FALSE(fuzzy.isMatch("abc123")); - EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced + EXPECT_FALSE(fuzzy.isMatch("12cdef")); } TEST(FuzzyMatcherTest, get_prefix_is_empty) { - FuzzyMatcher fuzzy = from_term("whatever", 2, 0); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0); EXPECT_EQ(fuzzy.getPrefix(), ""); } +TEST(FuzzyMatcherTest, term_is_empty) { + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0); + EXPECT_TRUE(fuzzy.isMatch("")); + EXPECT_TRUE(fuzzy.isMatch("a")); + EXPECT_TRUE(fuzzy.isMatch("aa")); + EXPECT_FALSE(fuzzy.isMatch("aaa")); +} + TEST(FuzzyMatcherTest, get_prefix_non_empty) { - FuzzyMatcher fuzzy = from_term("abcd", 2, 2); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2); EXPECT_EQ(fuzzy.getPrefix(), "ab"); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp index e170f83c541..21fdb86a48d 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -6,24 +6,51 @@ #include <vespa/vespalib/text/lowercase.h> #include <vespa/vespalib/text/utf8.h> -vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) { - return FuzzyMatcher(LowerCase::convert_to_ucs4(term)); +vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) { + return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength); +} + +std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) { + if (prefixLength == 0 || termCodepoints.empty()) { + return {}; + } else { + uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size())); + return {termCodepoints.begin(), termCodepoints.begin() + actualPrefixLength}; + } +} + +std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) { + if (termCodepoints.empty()) { + return {}; + } else { + uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size())); + return {termCodepoints.begin() + actualPrefixLength, termCodepoints.end()}; + } } bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target); + if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty + std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size); + // if prefix does not match, early stop + if (!std::equal(_folded_term_codepoints_prefix.begin(), _folded_term_codepoints_prefix.end(), + targetPrefix.begin(), targetPrefix.end())) { + return false; + } + } + return LevenshteinDistance::calculate( - _folded_term_codepoints, - targetCodepoints, + _folded_term_codepoints_suffix, + get_suffix(targetCodepoints, _prefix_size), _max_edit_distance).has_value(); } vespalib::string vespalib::FuzzyMatcher::getPrefix() const { vespalib::string prefix; Utf8Writer writer(prefix); - for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { - writer.putChar(_folded_term_codepoints.at(i)); + for (const uint32_t& code: _folded_term_codepoints_prefix) { + writer.putChar(code); } return prefix; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h index d4ee0e6d40f..12cc039706f 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -3,6 +3,7 @@ #include <string_view> #include <vector> +#include <span> #include <vespa/vespalib/stllike/string.h> @@ -13,37 +14,46 @@ namespace vespalib { * Class has two main parameters: * - prefix size, i.e the size of the prefix that is considered frozen. * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms - * Note that prefix size is not impacting matching logic, - * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic. - * But there is no code to actually enforcing it. + * Prefix size dictates of how match of a prefix is frozen, + * i.e. if prefixes between the document and the query do not match (after lowercase) + * matcher would return false early, without fuzzy match. */ class FuzzyMatcher { private: - static constexpr uint8_t DefaultPrefixSize = 0u; - static constexpr uint8_t DefaultMaxEditDistance = 2u; + static constexpr uint32_t DefaultPrefixSize = 0u; + static constexpr uint32_t DefaultMaxEditDistance = 2u; + + uint32_t _max_edit_distance; // max edit distance + uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy std::vector<uint32_t> _folded_term_codepoints; - uint8_t _max_edit_distance; // max edit distance - uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy + std::span<const uint32_t> _folded_term_codepoints_prefix; + std::span<const uint32_t> _folded_term_codepoints_suffix; public: FuzzyMatcher(): - _folded_term_codepoints(), _max_edit_distance(DefaultMaxEditDistance), - _prefix_size(DefaultPrefixSize) + _prefix_size(DefaultPrefixSize), + _folded_term_codepoints(), + _folded_term_codepoints_prefix(), + _folded_term_codepoints_suffix() {} - FuzzyMatcher(std::vector<uint32_t> codepoints): - _folded_term_codepoints(std::move(codepoints)), + FuzzyMatcher(const std::vector<uint32_t>&& codepoints): _max_edit_distance(DefaultMaxEditDistance), - _prefix_size(DefaultPrefixSize) + _prefix_size(DefaultPrefixSize), + _folded_term_codepoints(codepoints), + _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), + _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) {} - FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size): - _folded_term_codepoints(std::move(codepoints)), + FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size): _max_edit_distance(max_edit_distance), - _prefix_size(prefix_size) + _prefix_size(prefix_size), + _folded_term_codepoints(codepoints), + _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), + _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) {} [[nodiscard]] bool isMatch(std::string_view target) const; @@ -52,7 +62,11 @@ public: /// - static FuzzyMatcher from_term(std::string_view term); + static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength); + + static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength); + + static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength); }; diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp index 86ced3e52db..5f97a652f52 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp @@ -4,9 +4,10 @@ #include "levenshtein_distance.h" #include <limits> +#include <vector> std::optional<uint32_t> -vespalib::LevenshteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) +vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold) { uint32_t n = left.size(); uint32_t m = right.size(); diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h index 311211aba7a..c1a3435db44 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h @@ -3,7 +3,7 @@ #include <optional> #include <cstdint> -#include <vector> +#include <span> namespace vespalib { @@ -19,7 +19,7 @@ namespace vespalib { */ class LevenshteinDistance { public: - static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold); + static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold); }; } |