summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorAlexey Chernyshev <aleksei@spotify.com>2022-04-04 16:23:07 +0200
committerAlexey Chernyshev <aleksei@spotify.com>2022-04-07 14:44:30 +0200
commit7e9b33401201db9a9e22971dd419247e268bbfaa (patch)
treef5032a82e9fa74247b2fdeb3dcde4dc6cf98ce89 /vespalib
parentad7cc1d11f0c19baa2344a643377576c559555f7 (diff)
Propagating annotations for fuzzy query
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp44
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp39
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h46
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp3
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h4
5 files changed, 104 insertions, 32 deletions
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index 60a4eab3f57..1395915fa5f 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -6,12 +6,34 @@
using namespace vespalib;
-FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) {
- return {LowerCase::convert_to_ucs4(term), threshold, prefix_size};
+
+template <typename T>
+void assert_span(std::span<const T> left, std::vector<T> right) {
+ EXPECT_TRUE(std::equal(left.begin(), left.end(), right.begin(), right.end()));
+}
+
+TEST(FuzzyMatcherTest, get_prefix_edge_cases) {
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 0), {});
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 1), {1 });
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 2), {1, 2});
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 3), {1, 2, 3});
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 10),{1, 2, 3});
+ assert_span(FuzzyMatcher::get_prefix({}, 0), {});
+ assert_span(FuzzyMatcher::get_prefix({}, 10), {});
+}
+
+TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 0), {1, 2, 3});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 1), {2, 3});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 2), {3});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 3), {});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 10), {});
+ assert_span(FuzzyMatcher::get_suffix({}, 0), {});
+ assert_span(FuzzyMatcher::get_suffix({}, 10),{});
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy = from_term("abc", 2, 0);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -20,22 +42,30 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
}
TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy = from_term("abcdef", 2, 2);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
EXPECT_TRUE(fuzzy.isMatch("abcd12"));
EXPECT_FALSE(fuzzy.isMatch("abc123"));
- EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced
+ EXPECT_FALSE(fuzzy.isMatch("12cdef"));
}
TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy = from_term("whatever", 2, 0);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
+TEST(FuzzyMatcherTest, term_is_empty) {
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0);
+ EXPECT_TRUE(fuzzy.isMatch(""));
+ EXPECT_TRUE(fuzzy.isMatch("a"));
+ EXPECT_TRUE(fuzzy.isMatch("aa"));
+ EXPECT_FALSE(fuzzy.isMatch("aaa"));
+}
+
TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy = from_term("abcd", 2, 2);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
index e170f83c541..21fdb86a48d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -6,24 +6,51 @@
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
-vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) {
- return FuzzyMatcher(LowerCase::convert_to_ucs4(term));
+vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) {
+ return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength);
+}
+
+std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) {
+ if (prefixLength == 0 || termCodepoints.empty()) {
+ return {};
+ } else {
+ uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size()));
+ return {termCodepoints.begin(), termCodepoints.begin() + actualPrefixLength};
+ }
+}
+
+std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) {
+ if (termCodepoints.empty()) {
+ return {};
+ } else {
+ uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size()));
+ return {termCodepoints.begin() + actualPrefixLength, termCodepoints.end()};
+ }
}
bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target);
+ if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty
+ std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size);
+ // if prefix does not match, early stop
+ if (!std::equal(_folded_term_codepoints_prefix.begin(), _folded_term_codepoints_prefix.end(),
+ targetPrefix.begin(), targetPrefix.end())) {
+ return false;
+ }
+ }
+
return LevenshteinDistance::calculate(
- _folded_term_codepoints,
- targetCodepoints,
+ _folded_term_codepoints_suffix,
+ get_suffix(targetCodepoints, _prefix_size),
_max_edit_distance).has_value();
}
vespalib::string vespalib::FuzzyMatcher::getPrefix() const {
vespalib::string prefix;
Utf8Writer writer(prefix);
- for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) {
- writer.putChar(_folded_term_codepoints.at(i));
+ for (const uint32_t& code: _folded_term_codepoints_prefix) {
+ writer.putChar(code);
}
return prefix;
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
index d4ee0e6d40f..12cc039706f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -3,6 +3,7 @@
#include <string_view>
#include <vector>
+#include <span>
#include <vespa/vespalib/stllike/string.h>
@@ -13,37 +14,46 @@ namespace vespalib {
* Class has two main parameters:
* - prefix size, i.e the size of the prefix that is considered frozen.
* - max edit distance, i.e an upper bound for edit distance for it to be a match between terms
- * Note that prefix size is not impacting matching logic,
- * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic.
- * But there is no code to actually enforcing it.
+ * Prefix size dictates of how match of a prefix is frozen,
+ * i.e. if prefixes between the document and the query do not match (after lowercase)
+ * matcher would return false early, without fuzzy match.
*/
class FuzzyMatcher {
private:
- static constexpr uint8_t DefaultPrefixSize = 0u;
- static constexpr uint8_t DefaultMaxEditDistance = 2u;
+ static constexpr uint32_t DefaultPrefixSize = 0u;
+ static constexpr uint32_t DefaultMaxEditDistance = 2u;
+
+ uint32_t _max_edit_distance; // max edit distance
+ uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
std::vector<uint32_t> _folded_term_codepoints;
- uint8_t _max_edit_distance; // max edit distance
- uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
+ std::span<const uint32_t> _folded_term_codepoints_prefix;
+ std::span<const uint32_t> _folded_term_codepoints_suffix;
public:
FuzzyMatcher():
- _folded_term_codepoints(),
_max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize)
+ _prefix_size(DefaultPrefixSize),
+ _folded_term_codepoints(),
+ _folded_term_codepoints_prefix(),
+ _folded_term_codepoints_suffix()
{}
- FuzzyMatcher(std::vector<uint32_t> codepoints):
- _folded_term_codepoints(std::move(codepoints)),
+ FuzzyMatcher(const std::vector<uint32_t>&& codepoints):
_max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize)
+ _prefix_size(DefaultPrefixSize),
+ _folded_term_codepoints(codepoints),
+ _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
+ _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
{}
- FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size):
- _folded_term_codepoints(std::move(codepoints)),
+ FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size):
_max_edit_distance(max_edit_distance),
- _prefix_size(prefix_size)
+ _prefix_size(prefix_size),
+ _folded_term_codepoints(codepoints),
+ _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
+ _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
{}
[[nodiscard]] bool isMatch(std::string_view target) const;
@@ -52,7 +62,11 @@ public:
///
- static FuzzyMatcher from_term(std::string_view term);
+ static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength);
+
+ static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
+
+ static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
};
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
index 86ced3e52db..5f97a652f52 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
@@ -4,9 +4,10 @@
#include "levenshtein_distance.h"
#include <limits>
+#include <vector>
std::optional<uint32_t>
-vespalib::LevenshteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold)
+vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold)
{
uint32_t n = left.size();
uint32_t m = right.size();
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
index 311211aba7a..c1a3435db44 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
@@ -3,7 +3,7 @@
#include <optional>
#include <cstdint>
-#include <vector>
+#include <span>
namespace vespalib {
@@ -19,7 +19,7 @@ namespace vespalib {
*/
class LevenshteinDistance {
public:
- static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold);
+ static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold);
};
}