diff options
author | Alexey Chernyshev <aleksei@spotify.com> | 2022-03-22 16:47:14 +0100 |
---|---|---|
committer | Alexey Chernyshev <aleksei@spotify.com> | 2022-03-23 16:21:05 +0100 |
commit | 6bcdc1ac1c1c3ce8b30472926098df989b9f7019 (patch) | |
tree | 1bec251b1711a093a196c25be896c680282abd24 /vespalib | |
parent | 32358e1689cded19a3c5d0213b0ef0c5329c1e33 (diff) |
Addressing more comments
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/tests/fuzzy/CMakeLists.txt | 16 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/fuzzy.cpp | 33 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp | 42 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/levenstein_distance_test.cpp | 39 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt | 3 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/fuzzy.h | 60 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp | 29 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h | 59 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp (renamed from vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp) | 48 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h | 15 |
10 files changed, 202 insertions, 142 deletions
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt index f58602296a5..2a415a9ad62 100644 --- a/vespalib/src/tests/fuzzy/CMakeLists.txt +++ b/vespalib/src/tests/fuzzy/CMakeLists.txt @@ -1,8 +1,18 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_add_executable(vespalib_fuzzy_test_app TEST +vespa_add_executable(vespalib_fuzzy_matcher_test_app TEST SOURCES - fuzzy.cpp + fuzzy_matcher_test.cpp DEPENDS vespalib + GTest::GTest ) -vespa_add_test(NAME vespalib_fuzzy_test_app COMMAND vespalib_fuzzy_test_app) +vespa_add_test(NAME vespalib_fuzzy_matcher_test_app COMMAND vespalib_fuzzy_matcher_test_app) + +vespa_add_executable(vespalib_levenstein_distance_test_app TEST + SOURCES + levenstein_distance_test.cpp + DEPENDS + vespalib + GTest::GTest + ) +vespa_add_test(NAME vespalib_levenstein_distance_test_app COMMAND vespalib_levenstein_distance_test_app)
\ No newline at end of file diff --git a/vespalib/src/tests/fuzzy/fuzzy.cpp b/vespalib/src/tests/fuzzy/fuzzy.cpp deleted file mode 100644 index 9ffb77b3742..00000000000 --- a/vespalib/src/tests/fuzzy/fuzzy.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vespalib/testkit/test_kit.h> -#include <vespa/vespalib/fuzzy/fuzzy.h> - -using namespace vespalib; - - -TEST("require that levenstein distance works") { - EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "abc", 2).value()); - EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "ABC", 2).value()); - EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("abc", "abd", 2).value()); - EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("ABC", "abd", 2).value()); - EXPECT_EQUAL(2u, Fuzzy::levenstein_distance("ABC", "add", 2).value()); - EXPECT_FALSE(Fuzzy::levenstein_distance("ABC", "ddd", 2).has_value()); -} - -TEST("require that extracting of a prefix works") { - Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 2, 2); - EXPECT_EQUAL("pr", fuzzy.getPrefix()); -} - -TEST("require that empty prefix works") { - Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 0, 2); - EXPECT_EQUAL("", fuzzy.getPrefix()); -} - -TEST("require that longer prefix size works") { - Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 100, 2); - EXPECT_EQUAL("prefix", fuzzy.getPrefix()); -} - - -TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp new file mode 100644 index 00000000000..60a4eab3f57 --- /dev/null +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -0,0 +1,42 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/fuzzy/fuzzy_matcher.h> +#include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/gtest/gtest.h> + +using namespace vespalib; + +FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) { + return {LowerCase::convert_to_ucs4(term), threshold, prefix_size}; +} + +TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { + FuzzyMatcher fuzzy = from_term("abc", 2, 0); + EXPECT_TRUE(fuzzy.isMatch("abc")); + EXPECT_TRUE(fuzzy.isMatch("ABC")); + EXPECT_TRUE(fuzzy.isMatch("ab1")); + EXPECT_TRUE(fuzzy.isMatch("a12")); + EXPECT_FALSE(fuzzy.isMatch("123")); +} + +TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { + FuzzyMatcher fuzzy = from_term("abcdef", 2, 2); + EXPECT_TRUE(fuzzy.isMatch("abcdef")); + EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); + EXPECT_TRUE(fuzzy.isMatch("abcde1")); + EXPECT_TRUE(fuzzy.isMatch("abcd12")); + EXPECT_FALSE(fuzzy.isMatch("abc123")); + EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced +} + +TEST(FuzzyMatcherTest, get_prefix_is_empty) { + FuzzyMatcher fuzzy = from_term("whatever", 2, 0); + EXPECT_EQ(fuzzy.getPrefix(), ""); +} + +TEST(FuzzyMatcherTest, get_prefix_non_empty) { + FuzzyMatcher fuzzy = from_term("abcd", 2, 2); + EXPECT_EQ(fuzzy.getPrefix(), "ab"); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp new file mode 100644 index 00000000000..efdcc82fce1 --- /dev/null +++ b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp @@ -0,0 +1,39 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/fuzzy/levenstein_distance.h> +#include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/gtest/gtest.h> + +std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) { + std::vector<uint32_t> leftCodepoints = vespalib::LowerCase::convert_to_ucs4(left); + std::vector<uint32_t> rightCodepoints = vespalib::LowerCase::convert_to_ucs4(right); + + std::optional<uint32_t> leftRight = vespalib::LevensteinDistance::calculate(leftCodepoints,rightCodepoints, threshold); + std::optional<uint32_t> rightLeft = vespalib::LevensteinDistance::calculate(rightCodepoints,leftCodepoints, threshold); + + EXPECT_EQ(leftRight, rightLeft); // should be independent whether left or right strings are swapped + + return leftRight; +} + +TEST(LevensteinDistance, calculate_edgecases) { + EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0}); + EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "1bc", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "a1c", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "ab", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "abcd", 2), std::optional{1}); + EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("a", "", 2), std::optional{1}); + EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "", 2), std::nullopt); + EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); +} + +GTEST_MAIN_RUN_ALL_TESTS() + diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt index 0c6c8f5c446..0f7583da1d7 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -1,7 +1,8 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(vespalib_vespalib_fuzzy OBJECT SOURCES - fuzzy.cpp + fuzzy_matcher.cpp + levenstein_distance.cpp DEPENDS ) diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h deleted file mode 100644 index 775c25445e3..00000000000 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include <string_view> -#include <utility> -#include <vector> -#include <optional> - -#include <vespa/vespalib/stllike/string.h> - -namespace vespalib { - -class Fuzzy { -private: - static constexpr uint8_t DefaultPrefixSize = 0u; - static constexpr uint8_t DefaultEditDistance = 2u; - - std::vector<uint32_t> _folded_term_codepoints; - - uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e non-fuzzy - uint8_t _edit_distance; // max edit distance - - -public: - Fuzzy(): - _folded_term_codepoints(), - _prefix_size(DefaultPrefixSize), - _edit_distance(DefaultEditDistance) - {} - - Fuzzy(std::vector<uint32_t> codepoints): - _folded_term_codepoints(std::move(codepoints)), - _prefix_size(DefaultPrefixSize), - _edit_distance(DefaultEditDistance) - {} - - Fuzzy(std::vector<uint32_t> codepoints, uint8_t prefix_size, uint8_t edit_distance): - _folded_term_codepoints(std::move(codepoints)), - _prefix_size(prefix_size), - _edit_distance(edit_distance) - {} - - [[nodiscard]] bool isMatch(std::string_view src) const; - - [[nodiscard]] vespalib::string getPrefix() const; - - /// - - static Fuzzy from_term(std::string_view term); - - static std::optional<uint32_t> levenstein_distance(const std::vector<uint32_t>& source, const std::vector<uint32_t>& target, uint32_t threshold); - - static std::optional<uint32_t> levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold); - - static std::vector<uint32_t> folded_codepoints(const char* src, size_t srcSize); - - static std::vector<uint32_t> folded_codepoints(std::string_view src); - -}; - -} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp new file mode 100644 index 00000000000..495e08c42e3 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "fuzzy_matcher.h" +#include "levenstein_distance.h" + +#include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/text/utf8.h> + +vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) { + return FuzzyMatcher(LowerCase::convert_to_ucs4(term)); +} + +bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { + std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target); + + return LevensteinDistance::calculate( + _folded_term_codepoints, + targetCodepoints, + _max_edit_distance).has_value(); +} + +vespalib::string vespalib::FuzzyMatcher::getPrefix() const { + vespalib::string prefix; + Utf8Writer writer(prefix); + for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { + writer.putChar(_folded_term_codepoints.at(i)); + } + return prefix; +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h new file mode 100644 index 00000000000..bfcf98d79fd --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -0,0 +1,59 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <string_view> +#include <vector> + +#include <vespa/vespalib/stllike/string.h> + +namespace vespalib { + +/** + * Fuzzy matching between a lowercased instances of query and document terms based on Levenstein distance. + * Class has two main parameters: + * - prefix size, i.e the size of the prefix that is considered frozen. + * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms + * Note that prefix size is not impacting matching logic, + * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic. + * But there is no code to actually enforcing it. + */ +class FuzzyMatcher { +private: + static constexpr uint8_t DefaultPrefixSize = 0u; + static constexpr uint8_t DefaultMaxEditDistance = 2u; + + std::vector<uint32_t> _folded_term_codepoints; + + uint8_t _max_edit_distance; // max edit distance + uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy + +public: + FuzzyMatcher(): + _folded_term_codepoints(), + _max_edit_distance(DefaultMaxEditDistance), + _prefix_size(DefaultPrefixSize) + {} + + FuzzyMatcher(std::vector<uint32_t> codepoints): + _folded_term_codepoints(std::move(codepoints)), + _max_edit_distance(DefaultMaxEditDistance), + _prefix_size(DefaultPrefixSize) + {} + + FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size): + _folded_term_codepoints(std::move(codepoints)), + _max_edit_distance(max_edit_distance), + _prefix_size(prefix_size) + {} + + [[nodiscard]] bool isMatch(std::string_view target) const; + + [[nodiscard]] vespalib::string getPrefix() const; + + /// + + static FuzzyMatcher from_term(std::string_view term); + +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp index f4744101a97..5ee0f2d4a2d 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp @@ -1,43 +1,16 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. // Levenstein distance algorithm is based off Java implementation from apache commons-text library licensed under the Apache 2.0 license. -#include "fuzzy.h" +#include "levenstein_distance.h" #include <limits> -#include <vespa/vespalib/text/utf8.h> -#include <vespa/vespalib/text/lowercase.h> - -vespalib::Fuzzy vespalib::Fuzzy::from_term(std::string_view term) { - return Fuzzy(folded_codepoints(term)); -} - -std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(const char* src, size_t srcSize) { - std::vector<uint32_t> result; - result.reserve(srcSize); - Utf8ReaderForZTS srcReader(src); - while (srcReader.hasMore()) { - result.emplace_back(LowerCase::convert(srcReader.getChar())); - } - return result; -} - -std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(std::string_view src) { - return folded_codepoints(src.data(), src.size()); -} - -std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold) { - std::vector<uint32_t> sourceCodepoints = folded_codepoints(source.data(), source.size()); - std::vector<uint32_t> targetCodepoints = folded_codepoints(target.data(), target.size()); - return levenstein_distance(sourceCodepoints, targetCodepoints, threshold); -} - -std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) { +std::optional<uint32_t> vespalib::LevensteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) { uint32_t n = left.size(); uint32_t m = right.size(); if (n > m) { - return levenstein_distance(right, left, threshold); + return calculate(right, left, threshold); } // if one string is empty, the edit distance is necessarily the length @@ -83,7 +56,6 @@ std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<u n : std::min(n, j + threshold); // ignore entry left of leftmost - // printf("j = %d, min = %d, max = %d, n = %d\n", j, min, max, n); if (min > 1) { d[min - 1] = std::numeric_limits<uint32_t>::max(); } @@ -112,17 +84,3 @@ std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<u } return std::nullopt; } - -bool vespalib::Fuzzy::isMatch(std::string_view src) const { - std::vector<uint32_t> srcCodepoints = folded_codepoints(src); - return levenstein_distance(_folded_term_codepoints, srcCodepoints, _edit_distance).has_value(); -} - -vespalib::string vespalib::Fuzzy::getPrefix() const { - vespalib::string prefix; - Utf8Writer writer(prefix); - for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { - writer.putChar(_folded_term_codepoints.at(i)); - } - return prefix; -} diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h new file mode 100644 index 00000000000..e109db3178c --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h @@ -0,0 +1,15 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <optional> +#include <cstdint> +#include <vector> + +namespace vespalib { + +class LevensteinDistance { +public: + static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold); +}; + +} |