diff options
author | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-04-19 12:47:00 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-19 12:47:00 +0200 |
commit | 06b7bccd5586ed353069593a71535e4c958bc63e (patch) | |
tree | 40c7b3d9a9759bdffa1bff6c862dd6546e80c8cd /vespalib | |
parent | bfa5d861a4492f9935c16a1596b30f7889143455 (diff) | |
parent | 0e8aab2c447e0ed5cac035d385c95ad44d8594d6 (diff) |
Merge pull request #30932 from vespa-engine/vekterli/levenshtein-prefix-matching-algo-support
Add prefix match support to Levenshtein algorithm implementations
Diffstat (limited to 'vespalib')
14 files changed, 379 insertions, 69 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index 69b34ece2c7..dfd1b3b0c3b 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -24,12 +24,13 @@ using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaT namespace { std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold, - LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type, + LevenshteinDfa::Matching matching) { - auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type); + auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type, matching); auto maybe_match_lhs = dfa_lhs.match(right); - auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type); + auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type, matching); auto maybe_match_rhs = dfa_rhs.match(left); EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); @@ -40,6 +41,12 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig return std::nullopt; } +std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold, + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) +{ + return do_calculate(left, right, threshold, casing, dfa_type, LevenshteinDfa::Matching::FullString); +} + std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold, LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) { @@ -74,6 +81,16 @@ struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> { return do_calculate(left, right, threshold, casing(), dfa_type()); } + static std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) { + // Prefix matching is not symmetric, cannot use do_calculate() as it implicitly checks this. + auto dfa = LevenshteinDfa::build(left, threshold, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + auto maybe_match = dfa.match(right); + if (maybe_match.matches()) { + return {maybe_match.edits()}; + } + return std::nullopt; + } + }; // All the baseline DFA tests use lowercase only, so they should have the exact same outcome @@ -98,6 +115,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max; EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max; EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max; + EXPECT_EQ(calculate("", "", max), std::optional{0}) << max; } EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); @@ -109,6 +127,7 @@ TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); EXPECT_EQ(calculate("abc", "", 2), std::nullopt); EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt); } TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { @@ -124,6 +143,37 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt); } +TEST_P(LevenshteinDfaTest, can_match_in_target_string_prefix_mode) { + for (auto max : {1, 2}) { + EXPECT_EQ(prefix_calculate("", "literally anything", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("", "", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("x", "", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abc", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcd", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcdef", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "ab", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("ac", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acd", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("bc", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acb", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acb", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abd", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbc", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xy", max), std::nullopt); + } + EXPECT_EQ(prefix_calculate("abc", "xxabc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); +} + void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor, std::string_view successor_prefix) { @@ -182,11 +232,22 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings "\xf0\x9f\xa4\xa9""food"); // <starry eyed emoji>food // Note that as a general rule, emojis are fickle beasts to deal with since a single - // emoji often takes up multiple code points, which we consider separate characters - // but a user sees as a single actual rendered glyph. + // emoji often takes up multiple code points. We consider these as separate characters + // but from a Unicode perspective they are considered part of an "extended grapheme + // cluster", from which the actually rendered glyph is derived. // Multi-code point character edit distance support is left as an exercise for the reader :D } +TEST_P(LevenshteinDfaTest, can_generate_successors_with_prefix_match_mode) { + auto dfa = LevenshteinDfa::build("ban", 1, casing(), dfa_type(), LevenshteinDfa::Matching::Prefix); + // There is no difference at all in successor output when using Prefix mode as opposed to + // FullString mode, but since we match _more_ source strings in prefix mode we cannot + // simply run the same test set for both modes. This just tests that the wiring seems fine. + test_dfa_successor(dfa, "", "\x01""an"); + test_dfa_successor(dfa, "xy", "yan"); + test_dfa_successor(dfa, "bolle", "bon"); +} + TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) { auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); // The successor string must be lexicographically larger than the input string. @@ -332,13 +393,21 @@ std::string bits_to_str(T v) { return ret; } -using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>; +using CasingAndDfaTypeAndMaxEdits = std::tuple< + LevenshteinDfa::Casing, + LevenshteinDfa::DfaType, + LevenshteinDfa::Matching, + uint32_t +>; struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> { - // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number. + // Print test suffix as e.g. "/Uncased_Explicit_FullString_1" instead of just a GTest-chosen number. static std::string stringify_params(const TestParamInfo<ParamType>& info) { std::ostringstream ss; - ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param); + ss << std::get<0>(info.param) + << '_' << std::get<1>(info.param) + << '_' << std::get<2>(info.param) + << '_' << std::get<3>(info.param); return ss.str(); } }; @@ -350,6 +419,8 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, Values(LevenshteinDfa::DfaType::Explicit, LevenshteinDfa::DfaType::Implicit, LevenshteinDfa::DfaType::Table), + Values(LevenshteinDfa::Matching::FullString, + LevenshteinDfa::Matching::Prefix), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); @@ -369,10 +440,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, * Inspired by approach used by Lucene DFA exhaustive testing. */ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { - const auto [casing, dfa_type, max_edits] = GetParam(); + const auto [casing, dfa_type, matching, max_edits] = GetParam(); for (uint32_t i = 0; i < 256; ++i) { const auto target = bits_to_str(static_cast<uint8_t>(i)); - auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type); + auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type, matching); std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast<uint8_t>(j)); diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp index 37d1d4b4c54..918cbc624db 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp @@ -16,6 +16,13 @@ std::optional<uint32_t> calculate(std::string_view left, std::string_view right, return leftRight; } +// Prefix matching is asymmetric and therefore cannot implicitly test result symmetry +std::optional<uint32_t> prefix_calculate(std::string_view left, std::string_view right, uint32_t threshold) { + auto left_codepoints = vespalib::LowerCase::convert_to_ucs4(left); + auto right_codepoints = vespalib::LowerCase::convert_to_ucs4(right); + return vespalib::LevenshteinDistance::calculate(left_codepoints, right_codepoints, threshold, true); +} + TEST(LevenshteinDistance, calculate_edgecases) { EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0}); EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1}); @@ -33,6 +40,54 @@ TEST(LevenshteinDistance, calculate_edgecases) { EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); EXPECT_EQ(calculate("abc", "", 2), std::nullopt); EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("abcde", "xad", 2), std::nullopt); +} + +TEST(LevenshteinDistance, prefix_match_edge_cases) { + // Same cases as LevenshteinDfaTest (TODO consolidate these somehow) + for (auto max : {1, 2}) { + EXPECT_EQ(prefix_calculate("", "literally anything", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("", "", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("x", "", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abc", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcd", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcdef", max), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "ab", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("ac", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acd", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xabcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("bc", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acb", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "acdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("acb", "abcdef", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abd", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdcfgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "abdefgh", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbc", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xbcdefg", max), std::optional{1}); + EXPECT_EQ(prefix_calculate("abc", "xy", max), std::nullopt); + } + EXPECT_EQ(prefix_calculate("abc", "xxabc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abc", "xxabcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abc", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("abcxx", "abcd", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); + EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); + + // Max edits > 2 cases; not supported by DFA implementation. + EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xyzzz", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abcd", "xyzd", 3), std::optional{3}); + EXPECT_EQ(prefix_calculate("abcd", "xyzz", 3), std::nullopt); + EXPECT_EQ(prefix_calculate("abcd", "", 3), std::nullopt); +} + +TEST(LevenshteinDistance, oversized_max_edits_is_well_defined) { + const auto k = uint32_t(INT32_MAX) + 10000u; + EXPECT_EQ(calculate("abc", "xyz", k), std::optional{3}); + EXPECT_EQ(prefix_calculate("abc", "xyzzzz", k), std::optional{3}); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h index c69f414aae6..8b2597dd179 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h @@ -20,6 +20,12 @@ concept DfaMatcher = requires(T a, std::string u8str, std::vector<uint32_t> u32s // matching to have the expected semantics, the actual target string must be pre-lowercased. { a.is_cased() } -> std::same_as<bool>; + // Whether the matching is performed in prefix mode. In prefix mode, a source string is + // considered a match if it matches the target string at any point during DFA traversal. + // I.e. the source string "bananas" prefix-matched against the target (prefix) "ban" will + // be returned as a match with 0 edits. + { a.is_prefix() } -> std::same_as<bool>; + // Initial (starting) state of the DFA { a.start() } -> std::same_as<typename T::StateType>; diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h index 490582b5bf7..a3542d61dc8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h @@ -97,9 +97,11 @@ public: private: std::vector<DfaNodeType> _nodes; const bool _is_cased; + const bool _is_prefix; public: - explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept - : _is_cased(is_cased) + ExplicitLevenshteinDfaImpl(bool is_cased, bool is_prefix) noexcept + : _is_cased(is_cased), + _is_prefix(is_prefix) {} ~ExplicitLevenshteinDfaImpl() override; @@ -140,10 +142,12 @@ template <typename Traits> class ExplicitLevenshteinDfaBuilder { const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string const bool _is_cased; + const bool _is_prefix; public: - ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased) noexcept + ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased, bool is_prefix) noexcept : _u32_str_buf(std::move(str)), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) {} [[nodiscard]] LevenshteinDfa build_dfa() const; diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp index 55dd459ff26..11a822d2d7e 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp @@ -22,16 +22,20 @@ struct ExplicitDfaMatcher { const std::span<const DfaNodeType> _nodes; const bool _is_cased; + const bool _is_prefix; - ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased) noexcept + ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased, bool is_prefix) noexcept : _nodes(nodes), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) {} static constexpr uint8_t max_edits() noexcept { return MaxEdits; } bool is_cased() const noexcept { return _is_cased; } + bool is_prefix() const noexcept { return _is_prefix; } + StateType start() const noexcept { return &_nodes[0]; } @@ -99,21 +103,21 @@ ExplicitLevenshteinDfaImpl<MaxEdits>::~ExplicitLevenshteinDfaImpl() = default; template <uint8_t MaxEdits> LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str) const { - ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); + ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix); return MatchAlgorithm<MaxEdits>::match(matcher, u8str); } template <uint8_t MaxEdits> LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string& successor_out) const { - ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); + ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix); return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out); } template <uint8_t MaxEdits> LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { - ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); + ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased, _is_prefix); return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out); } @@ -199,10 +203,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> { using Base::transitions; const bool _is_cased; + const bool _is_prefix; public: - ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased) noexcept + ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased, bool is_prefix) noexcept : DfaSteppingBase<Traits>(str), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) { assert(str.size() < UINT32_MAX / max_out_edges_per_node()); } @@ -217,7 +223,7 @@ public: template <typename Traits> LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const { - auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased); + auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased, _is_prefix); ExploreState<StateType> exp; // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_ // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth. @@ -257,7 +263,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const { template <typename Traits> LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const { - ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased); + ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased, _is_prefix); return builder.build_dfa(); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h index 20c8e1b7e2b..552fd849734 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h @@ -13,14 +13,16 @@ class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl { std::string _target_as_utf8; std::vector<uint32_t> _target_utf8_char_offsets; const bool _is_cased; + const bool _is_prefix; public: using MatchResult = LevenshteinDfa::MatchResult; - ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) + ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix) : _u32_str_buf(std::move(str)), _target_as_utf8(), _target_utf8_char_offsets(), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) { precompute_utf8_target_with_offsets(); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp index 05ef2761f34..7dadcc59b8e 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp @@ -32,21 +32,26 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> { std::span<const char> _target_as_utf8; std::span<const uint32_t> _target_utf8_char_offsets; const bool _is_cased; + const bool _is_prefix; ImplicitDfaMatcher(std::span<const uint32_t> u32_str, std::span<const char> target_as_utf8, std::span<const uint32_t> target_utf8_char_offsets, - bool is_cased) noexcept + bool is_cased, + bool is_prefix) noexcept : Base(u32_str), _target_as_utf8(target_as_utf8), _target_utf8_char_offsets(target_utf8_char_offsets), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) {} // start, is_match, can_match, match_edit_distance are all provided by base type bool is_cased() const noexcept { return _is_cased; } + bool is_prefix() const noexcept { return _is_prefix; } + template <typename F> bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) { for (uint32_t i = 0; i < state.size(); ++i) { @@ -137,21 +142,21 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> { template <typename Traits> LevenshteinDfa::MatchResult ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str) const { - ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix); return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str); } template <typename Traits> LevenshteinDfa::MatchResult ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string& successor_out) const { - ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix); return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); } template <typename Traits> LevenshteinDfa::MatchResult ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { - ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased, _is_prefix); return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index 19c2fffbb3e..468619c8036 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -41,34 +41,40 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const { _impl->dump_as_graphviz(out); } -LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) { +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, + Casing casing, DfaType dfa_type, Matching matching) { if (max_edits != 1 && max_edits != 2) { throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits)); } - const bool is_cased = (casing == Casing::Cased); + const bool is_cased = (casing == Casing::Cased); + const bool is_prefix = (matching == Matching::Prefix); auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string) : utf8_string_to_utf32_lowercased(target_string); if (dfa_type == DfaType::Implicit) { if (max_edits == 1) { - return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased, is_prefix)); } else { // max_edits == 2 - return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased, is_prefix)); } } else if(dfa_type == DfaType::Explicit) { if (max_edits == 1) { - return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa(); + return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased, is_prefix).build_dfa(); } else { // max_edits == 2 - return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa(); + return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased, is_prefix).build_dfa(); } } else { // DfaType::Table if (max_edits == 1) { - return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased, is_prefix)); } else { // max_edits == 2 - return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased)); + return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased, is_prefix)); } } } +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) { + return build(target_string, max_edits, casing, dfa_type, Matching::FullString); +} + LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) { // TODO automatically select implementation based on target length/max edits? // Suggestion: @@ -112,4 +118,12 @@ std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) { return os; } +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m) { + switch (m) { + case LevenshteinDfa::Matching::FullString: os << "FullString"; return os; + case LevenshteinDfa::Matching::Prefix: os << "Prefix"; return os; + } + abort(); +} + } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index 44c62bdb957..feace39b313 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -265,6 +265,33 @@ public: Cased }; + enum class Matching { + /** + * Edit distance is computed based on the _entire_ source string. Matching is + * symmetric between source and target strings, i.e. match(x, y) and match(y, x) + * will yield the same result. + */ + FullString, + /** + * Edit distance is computed based on a _prefix_ of the source string, as compared + * against the target string. Matching is therefore _asymmetric_ between source and + * target strings. + * + * Example of matching source strings against the target 'ban' (i.e. the prefix query) + * and 1 max edit distance: + * 'bananas' - 0 edits (source prefix 'ban' exact-matches 'ban') + * 'balloons' - 1 edit ('bal' vs 'ban') + * '2bananas' - 1 edit ('2ban' vs 'ban') + * 'boonanas' - mismatch (2 edits) + * + * Note that Prefix matching will match a lot more strings than FullString, so in + * practice it should be combined with prefix _locking_ to constrain the candidate + * result set to a more reasonable cardinality. In particular, max edits >= |target| + * will match _every_ source string trivially. + */ + Prefix + }; + /** * Builds and returns a Levenshtein DFA that matches all strings within `max_edits` * edits of `target_string`. The type of DFA returned is specified by dfa_type. @@ -274,6 +301,9 @@ public: * `target_string` must not contain any null UTF-8 chars. */ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, + Casing casing, DfaType dfa_type, Matching matching); + + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type); /** @@ -301,5 +331,6 @@ public: std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos); std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt); std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c); +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Matching m); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp index 4658635d880..f5b8d0fc0b8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp @@ -3,31 +3,46 @@ #include "levenshtein_distance.h" +#include <cassert> #include <limits> #include <vector> +namespace vespalib { + std::optional<uint32_t> -vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold) +LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, + uint32_t threshold, bool prefix_match) { + assert(left.size() <= static_cast<size_t>(INT32_MAX)); + assert(right.size() <= static_cast<size_t>(INT32_MAX)); + threshold = std::min(threshold, static_cast<uint32_t>(std::numeric_limits<int32_t>::max())); uint32_t n = left.size(); uint32_t m = right.size(); - if (n > m) { - return calculate(right, left, threshold); - } - - // if one string is empty, the edit distance is necessarily the length - // of the other - if (n == 0) { - return m <= threshold ? std::optional(m) : std::nullopt; - } - if (m == 0) { - return n <= threshold ? std::optional(n) : std::nullopt; - } - - // the edit distance cannot be less than the length difference - if (m - n > threshold) { - return std::nullopt; + if (!prefix_match) { + // These optimizations are only valid when matching with target/source string symmetry. + // Correctness of the main matrix calculation loop should not depend on these. + if (n > m) { + return calculate(right, left, threshold, false); + } + // if one string is empty, the edit distance is necessarily the length + // of the other. + if (n == 0) { + return m <= threshold ? std::optional(m) : std::nullopt; + } + if (m == 0) { + return n <= threshold ? std::optional(n) : std::nullopt; + } + // the edit distance cannot be less than the length difference + if (m - n > threshold) { + return std::nullopt; + } + } else { + // A source (right) cannot be transformed into a target prefix (left) if doing + // so would require inserting more than max edits number of characters. + if ((n > m) && (n - m > threshold)) { + return std::nullopt; + } } std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally @@ -48,6 +63,7 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp } // iterates through t + uint32_t min_edits = n; // prefix matching: worst-case to transform to target for (uint32_t j = 1; j <= m; ++j) { uint32_t rightJ = right[j - 1]; // jth character of right d[0] = j; @@ -56,9 +72,9 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ? n : std::min(n, j + threshold); - // ignore entry left of leftmost if (min > 1) { + assert(static_cast<size_t>(min) <= d.size()); d[min - 1] = std::numeric_limits<uint32_t>::max(); } @@ -76,12 +92,35 @@ vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::sp lowerBound = std::min(lowerBound, d[i]); } if (lowerBound > threshold) { - return std::nullopt; + if (!prefix_match) { + return std::nullopt; // Cannot match + } else { + break; // May already have matched via min_edits + } } std::swap(p, d); + // For prefix matching: + // By definition, the Levenshtein matrix cell at row `i`, column `j` + // provides the minimum number of edits required to transform a prefix of + // source string S (up to and including length `i`) into a prefix of target + // string T (up to and including length `j`). Since we want to match against + // the entire target (prefix query) string with length `n`, the problem is + // reduced to finding the minimum value of the `n`th column that is `<= k` + // (aggregated here and checked after the loop). + min_edits = std::min(p[n], min_edits); } - if (p[n] <= threshold) { + if (prefix_match) { + return ((min_edits <= threshold) ? std::optional<uint32_t>{min_edits} : std::nullopt); + } else if (p[n] <= threshold) { return {p[n]}; } return std::nullopt; } + +std::optional<uint32_t> +LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold) +{ + return calculate(left, right, threshold, false); +} + +} // vespalib diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h index f105dcdaaf4..e284e071731 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h @@ -19,7 +19,15 @@ namespace vespalib { */ class LevenshteinDistance { public: - static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold); + // Iff `prefix_match` == true, `right` is the candidate to match against prefix `left` + static std::optional<uint32_t> calculate(std::span<const uint32_t> left, + std::span<const uint32_t> right, + uint32_t threshold, + bool prefix_match); + + static std::optional<uint32_t> calculate(std::span<const uint32_t> left, + std::span<const uint32_t> right, + uint32_t threshold); }; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index 388099a0358..e8b1bb5b415 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -25,11 +25,14 @@ struct MatchAlgorithm { static constexpr uint8_t max_edits() noexcept { return MaxEdits; } /** - * Matches UTF-8 source string `source` against the target DFA, optionally generating + * Matches UTF-8/32 source string `source` against the target DFA, optionally generating * the successor string iff the source string is not within the maximum number of edits * of the target string. * - * The actual match loop is very simple: we try to match the DFA as far as we can + * The actual match loop is very simple, but with some subtle differences in semantics + * depending on whether the "full string" or "prefix" matching mode is used. + * + * For the full string matching mode, we try to match the DFA as far as we can * before either consuming all input (source string) characters or ending up in a non- * matching state before we have consumed all input. In the former case, we may be in * a matching state (consider matching "foo" with the target string "food"; after @@ -39,6 +42,15 @@ struct MatchAlgorithm { * If we end up in a matching state, all is well. We simply return a MatchResult with * the number of edits the state represents. * + * Prefix matching is very similar, but since we're interested in the number of edits + * required to transform a _prefix_ of the source into the target string, we cannot + * require matching the entire source string in the case that it is longer than the + * target (as that would not just be a prefix, after all). We consider a source prefix + * to be matched if _any_ DFA state is a match. This may be the initial state. + * Since the first match might not yield the minimum edit distance, we traverse the + * DFA until it no longer matches and return the smallest encountered distance as the + * final result. + * * The interesting bit happens the string does _not_ match and we are asked to provide a * _successor_ string that _does_ match and is strictly greater in lexicographic order. * @@ -143,6 +155,18 @@ struct MatchAlgorithm { * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different * ordering when compared byte-wise against an implicitly lowercased dictionary. * + * The successor generation algorithm generalizes without changes to work as expected + * when prefix matching is used. This is because the target string represents the + * prefix and the successor consequently represents the smallest possible greater prefix. + * + * Proof by contradiction: S is a source string that does _not_ prefix match target + * string T, and S' is the returned successor. Assume that discarding (skipping) all + * candidate source strings C with S < C < S' misses at least one such candidate C' + * that has a prefix matching T. This means that it must be possible to traverse the + * DFA with source string C' and end up in a matching state. But by definition the + * successor S' is the smallest possible lexicographically greater string that can + * possibly match the DFA for T; a contradiction. + * * TODO let matcher know if source string is pre-normalized (i.e. lowercased). */ template <DfaMatcher Matcher, typename SuccessorT> @@ -158,6 +182,9 @@ struct MatchAlgorithm { StateType state = matcher.start(); while (u8_reader.hasMore()) { + if (matcher.is_prefix() && matcher.is_match(state)) { + return minimal_matching_prefix_distance(matcher, state, u8_reader); + } const auto pos_before_char = static_cast<uint32_t>(successor_out.size()); const uint32_t raw_mch = u8_reader.getChar(); const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); @@ -198,6 +225,9 @@ struct MatchAlgorithm { Utf8Reader u8_reader(source.data(), source.size()); StateType state = matcher.start(); while (u8_reader.hasMore()) { + if (matcher.is_prefix() && matcher.is_match(state)) { + return minimal_matching_prefix_distance(matcher, state, u8_reader); + } const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased()); auto maybe_next = matcher.match_input(state, mch); if (matcher.can_match(maybe_next)) { @@ -214,6 +244,41 @@ struct MatchAlgorithm { } /** + * Follows the DFA (in an already matching state) until it no longer matches, keeping + * track of the minimum edit distance encountered. This is used to compute the minimum + * number of edits during prefix matching, as just returning the edit distance of the + * _first_ matching state may not result in the minimum. + * + * Example 1: matching the source string 'bananas' against the prefix target 'ban' with + * max edits 2 will be in a matching state already after consuming 'b' (can append 2 + * characters after), but had we processed the remaining characters we would end up with + * the actual prefix edit distance of 0. + * + * Example 2: matching the source string 'abcdef' against target 'acd' with max edits 2 + * will be in a matching state after 'a' (2 edits), but the minimal edit prefix is 'abcd' + * (1 edit). This demonstrates that we may have to process more than |target| number of + * source characters to get a minimal distance. + */ + template <DfaMatcher Matcher> + static MatchResult minimal_matching_prefix_distance(const Matcher& matcher, + typename Matcher::StateType state, + Utf8Reader& u8_reader) + { + auto min_edits = matcher.match_edit_distance(state); + while (u8_reader.hasMore()) { + const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased()); + auto maybe_next = matcher.match_input(state, mch); + if (matcher.can_match(maybe_next)) { + state = maybe_next; + min_edits = std::min(min_edits, matcher.match_edit_distance(state)); + } else { + break; + } + } + return MatchResult::make_match(max_edits(), min_edits); + } + + /** * Instantly backtrack to the last possible branching point in the DFA where we can * choose some higher outgoing edge character value and still match the DFA. If the node * has a wildcard edge, we can bump the input char by one and generate the smallest diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h index 4ad203c9a46..a15f321f32d 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.h @@ -46,12 +46,13 @@ public: private: const std::vector<Lookup> _lookup; const bool _is_cased; + const bool _is_prefix; static std::vector<Lookup> make_lookup(const std::vector<uint32_t> &str); public: using MatchResult = LevenshteinDfa::MatchResult; - TableDfa(std::vector<uint32_t> str, bool is_cased); + TableDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix); ~TableDfa() override; [[nodiscard]] MatchResult match(std::string_view source) const override; [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const override; diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp index e7c721f644e..721cfe296d2 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp @@ -353,11 +353,13 @@ struct TableMatcher { const TableDfa<N>::Lookup *lookup; const uint32_t end; const bool cased; + const bool prefix; - TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in) - noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {} + TableMatcher(const TableDfa<N>::Lookup *lookup_in, uint32_t end_in, bool cased_in, bool prefix_in) + noexcept : lookup(lookup_in), end(end_in), cased(cased_in), prefix(prefix_in) {} bool is_cased() const noexcept { return cased; } + bool is_prefix() const noexcept { return prefix; } static constexpr S start() noexcept { return S::start(); } uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); } @@ -473,9 +475,10 @@ TableDfa<N>::make_lookup(const std::vector<uint32_t> &str)->std::vector<Lookup> } template <uint8_t N> -TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased) +TableDfa<N>::TableDfa(std::vector<uint32_t> str, bool is_cased, bool is_prefix) : _lookup(make_lookup(str)), - _is_cased(is_cased) + _is_cased(is_cased), + _is_prefix(is_prefix) { } @@ -486,7 +489,7 @@ template <uint8_t N> LevenshteinDfa::MatchResult TableDfa<N>::match(std::string_view u8str) const { - TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix); return MatchAlgorithm<N>::match(matcher, u8str); } @@ -494,7 +497,7 @@ template <uint8_t N> LevenshteinDfa::MatchResult TableDfa<N>::match(std::string_view u8str, std::string& successor_out) const { - TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix); return MatchAlgorithm<N>::match(matcher, u8str, successor_out); } @@ -502,7 +505,7 @@ template <uint8_t N> LevenshteinDfa::MatchResult TableDfa<N>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { - TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased); + TableMatcher<N> matcher(_lookup.data(), _lookup.size() - 1, _is_cased, _is_prefix); return MatchAlgorithm<N>::match(matcher, u8str, successor_out); } |