diff options
author | Alexey Chernyshev <aleksei@spotify.com> | 2022-03-10 16:33:07 +0100 |
---|---|---|
committer | Alexey Chernyshev <aleksei@spotify.com> | 2022-03-23 16:20:59 +0100 |
commit | d9805209e3b0e33be3c0cc454c4604043663c1c4 (patch) | |
tree | 7446c79f68acd8775233ace4d5a70058f90c8406 /vespalib | |
parent | a2b1e6654cabc90ddf7422e58adf641876e5201c (diff) |
Introducing fuzzy search
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/CMakeLists.txt | 2 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/.gitignore | 4 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/CMakeLists.txt | 8 | ||||
-rw-r--r-- | vespalib/src/tests/fuzzy/fuzzy.cpp | 33 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt | 7 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp | 144 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/fuzzy.h | 60 |
8 files changed, 259 insertions, 0 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index 29a0c1e7bb8..c3c7fee2cef 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -62,6 +62,7 @@ vespa_define_module( src/tests/explore_modern_cpp src/tests/false src/tests/fiddle + src/tests/fuzzy src/tests/gencnt src/tests/guard src/tests/host_name @@ -169,6 +170,7 @@ vespa_define_module( src/vespa/vespalib/data src/vespa/vespalib/data/slime src/vespa/vespalib/datastore + src/vespa/vespalib/fuzzy src/vespa/vespalib/geo src/vespa/vespalib/hwaccelrated src/vespa/vespalib/io diff --git a/vespalib/src/tests/fuzzy/.gitignore b/vespalib/src/tests/fuzzy/.gitignore new file mode 100644 index 00000000000..cd5ca9b3b88 --- /dev/null +++ b/vespalib/src/tests/fuzzy/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +fuzzy_test +vespalib_fuzzy_test_app diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt new file mode 100644 index 00000000000..f58602296a5 --- /dev/null +++ b/vespalib/src/tests/fuzzy/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_fuzzy_test_app TEST + SOURCES + fuzzy.cpp + DEPENDS + vespalib + ) +vespa_add_test(NAME vespalib_fuzzy_test_app COMMAND vespalib_fuzzy_test_app) diff --git a/vespalib/src/tests/fuzzy/fuzzy.cpp b/vespalib/src/tests/fuzzy/fuzzy.cpp new file mode 100644 index 00000000000..9ffb77b3742 --- /dev/null +++ b/vespalib/src/tests/fuzzy/fuzzy.cpp @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/vespalib/fuzzy/fuzzy.h> + +using namespace vespalib; + + +TEST("require that levenstein distance works") { + EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "abc", 2).value()); + EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "ABC", 2).value()); + EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("abc", "abd", 2).value()); + EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("ABC", "abd", 2).value()); + EXPECT_EQUAL(2u, Fuzzy::levenstein_distance("ABC", "add", 2).value()); + EXPECT_FALSE(Fuzzy::levenstein_distance("ABC", "ddd", 2).has_value()); +} + +TEST("require that extracting of a prefix works") { + Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 2, 2); + EXPECT_EQUAL("pr", fuzzy.getPrefix()); +} + +TEST("require that empty prefix works") { + Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 0, 2); + EXPECT_EQUAL("", fuzzy.getPrefix()); +} + +TEST("require that longer prefix size works") { + Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 100, 2); + EXPECT_EQUAL("prefix", fuzzy.getPrefix()); +} + + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt index dea0f509bad..1fffddf2f2f 100644 --- a/vespalib/src/vespa/vespalib/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(vespalib $<TARGET_OBJECTS:vespalib_vespalib_data> $<TARGET_OBJECTS:vespalib_vespalib_data_slime> $<TARGET_OBJECTS:vespalib_vespalib_datastore> + $<TARGET_OBJECTS:vespalib_vespalib_fuzzy> $<TARGET_OBJECTS:vespalib_vespalib_geo> $<TARGET_OBJECTS:vespalib_vespalib_hwaccelrated> $<TARGET_OBJECTS:vespalib_vespalib_io> diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt new file mode 100644 index 00000000000..0c6c8f5c446 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(vespalib_vespalib_fuzzy OBJECT + SOURCES + fuzzy.cpp + DEPENDS + ) + diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp new file mode 100644 index 00000000000..55d11b74123 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Levenstein distance algorithm is based off Java implementation from apache commons-text library + */ + +#include "fuzzy.h" + +#include <limits> + +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/text/lowercase.h> + +vespalib::Fuzzy vespalib::Fuzzy::from_term(std::string_view term) { + return Fuzzy(folded_codepoints(term)); +} + +std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(const char* src, size_t srcSize) { + std::vector<uint32_t> result; + result.reserve(srcSize); + Utf8ReaderForZTS srcReader(src); + while (srcReader.hasMore()) { + result.emplace_back(LowerCase::convert(srcReader.getChar())); + } + return result; +} + +std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(std::string_view src) { + return folded_codepoints(src.data(), src.size()); +} + +std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold) { + std::vector<uint32_t> sourceCodepoints = folded_codepoints(source.data(), source.size()); + std::vector<uint32_t> targetCodepoints = folded_codepoints(target.data(), target.size()); + return levenstein_distance(sourceCodepoints, targetCodepoints, threshold); +} + +std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) { + uint32_t n = left.size(); + uint32_t m = right.size(); + + if (n > m) { + return levenstein_distance(right, left, threshold); + } + + // if one string is empty, the edit distance is necessarily the length + // of the other + if (n == 0) { + return m <= threshold ? std::optional(m) : std::nullopt; + } + if (m == 0) { + return n <= threshold ? std::optional(n) : std::nullopt; + } + + + // the edit distance cannot be less than the length difference + if (m - n > threshold) { + return std::nullopt; + } + + std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally + std::vector<uint32_t> d(n+1); // cost array, horizontally + + const uint32_t boundary = std::min(n, threshold) + 1; + + for (uint32_t i = 0; i < boundary; ++i) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + for (uint32_t i = boundary; i < p.size(); ++i) { + p[i] = std::numeric_limits<uint32_t>::max(); + } + for (uint32_t i = 0; i < d.size(); ++i) { + d[i] = std::numeric_limits<uint32_t>::max(); + } + + // iterates through t + for (uint32_t j = 1; j <= m; ++j) { + uint32_t rightJ = right[j - 1]; // jth character of right + d[0] = j; + + int32_t min = std::max(1, static_cast<int32_t>(j) - static_cast<int32_t>(threshold)); + + uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ? + n : std::min(n, j + threshold); + + // ignore entry left of leftmost + // printf("j = %d, min = %d, max = %d, n = %d\n", j, min, max, n); + if (min > 1) { + d[min - 1] = std::numeric_limits<uint32_t>::max(); + } + + uint32_t lowerBound = std::numeric_limits<uint32_t>::max(); + + for (uint32_t i = min; i <= max; ++i) { + if (left[i - 1] == rightJ) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally + // left and up + + d[i] = 1 + std::min(std::min(d[i - 1], p[i]), p[i - 1]); + } + lowerBound = std::min(lowerBound, d[i]); + } + if (lowerBound > threshold) { + return std::nullopt; + } + std::swap(p, d); + } + if (p[n] <= threshold) { + return {p[n]}; + } + return std::nullopt; +} + +bool vespalib::Fuzzy::isMatch(std::string_view src) const { + std::vector<uint32_t> srcCodepoints = folded_codepoints(src); + return levenstein_distance(_folded_term_codepoints, srcCodepoints, _edit_distance).has_value(); +} + +vespalib::string vespalib::Fuzzy::getPrefix() const { + vespalib::string prefix; + Utf8Writer writer(prefix); + for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { + writer.putChar(_folded_term_codepoints.at(i)); + } + return prefix; +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h new file mode 100644 index 00000000000..775c25445e3 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h @@ -0,0 +1,60 @@ +#pragma once + +#include <string_view> +#include <utility> +#include <vector> +#include <optional> + +#include <vespa/vespalib/stllike/string.h> + +namespace vespalib { + +class Fuzzy { +private: + static constexpr uint8_t DefaultPrefixSize = 0u; + static constexpr uint8_t DefaultEditDistance = 2u; + + std::vector<uint32_t> _folded_term_codepoints; + + uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e non-fuzzy + uint8_t _edit_distance; // max edit distance + + +public: + Fuzzy(): + _folded_term_codepoints(), + _prefix_size(DefaultPrefixSize), + _edit_distance(DefaultEditDistance) + {} + + Fuzzy(std::vector<uint32_t> codepoints): + _folded_term_codepoints(std::move(codepoints)), + _prefix_size(DefaultPrefixSize), + _edit_distance(DefaultEditDistance) + {} + + Fuzzy(std::vector<uint32_t> codepoints, uint8_t prefix_size, uint8_t edit_distance): + _folded_term_codepoints(std::move(codepoints)), + _prefix_size(prefix_size), + _edit_distance(edit_distance) + {} + + [[nodiscard]] bool isMatch(std::string_view src) const; + + [[nodiscard]] vespalib::string getPrefix() const; + + /// + + static Fuzzy from_term(std::string_view term); + + static std::optional<uint32_t> levenstein_distance(const std::vector<uint32_t>& source, const std::vector<uint32_t>& target, uint32_t threshold); + + static std::optional<uint32_t> levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold); + + static std::vector<uint32_t> folded_codepoints(const char* src, size_t srcSize); + + static std::vector<uint32_t> folded_codepoints(std::string_view src); + +}; + +} |