Merge pull request #28558 from vespa-engine/vekterli/dfa-successor-generation-optimization

Levenshtein DFA successor generation optimization and UTF-32 output
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2023-09-18 16:19:57 +0200
committer: GitHub <noreply@github.com> 2023-09-18 16:19:57 +0200
commit: eede788d1c20d2f246f44287308dad69487369ea (patch)
tree: 71406c55b9310a79bb60f93e57314729001c8500
parent: 5132cfd082fbc0ef2f0c7c4a760925b613cc975e (diff)
parent: 4a004d2e8d472c660cc9c2b73ee637c5247968da (diff)
13 files changed, 249 insertions, 54 deletions
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 072eb5e1333..6b873020994 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -25,7 +25,7 @@ public:
 
     template <typename DictionaryConstIteratorType>
     bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) {
-        auto match = _dfa.match(word, &_successor);
+        auto match = _dfa.match(word, _successor);
         if (match.matches()) {
             return true;
         } else {
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index c466d0bfb3e..c235cb99509 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -26,10 +26,10 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig
                                      LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
 {
     auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
-    auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+    auto maybe_match_lhs = dfa_lhs.match(right);
 
     auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
-    auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+    auto maybe_match_rhs = dfa_rhs.match(left);
 
     EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
     if (maybe_match_lhs.matches()) {
@@ -47,6 +47,11 @@ std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view
     return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type);
 }
 
+void expect_utf32_string_code_point_equal_to_utf8(std::span<const uint32_t> u32str, const std::string& u8str) {
+    auto as_utf8 = utf32_string_to_utf8(u32str);
+    EXPECT_EQ(as_utf8, u8str);
+}
+
 }
 
 struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
@@ -119,14 +124,20 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
 
 void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) {
     std::string successor;
-    auto m = dfa.match(source, &successor);
+    auto m = dfa.match(source, successor);
     if (m.matches()) {
         FAIL() << "Expected '" << source << "' to emit a successor, but it "
                << "matched with " << static_cast<uint32_t>(m.edits())
                << " edits (of max " << static_cast<uint32_t>(m.max_edits()) <<  " edits)";
     }
     EXPECT_EQ(successor, expected_successor);
-    EXPECT_TRUE(dfa.match(successor, nullptr).matches());
+    EXPECT_TRUE(dfa.match(successor).matches());
+
+    // Make sure the UTF-32 successor output is codepoint-wise identical to the UTF-8 successor
+    std::vector<uint32_t> u32successor;
+    m = dfa.match(source, u32successor);
+    EXPECT_FALSE(m.matches());
+    expect_utf32_string_code_point_equal_to_utf8(u32successor, successor);
 }
 
 TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
@@ -331,7 +342,7 @@ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
         std::string skip_to, successor;
         for (uint32_t j = 0; j < 256; ++j) {
             const auto source = bits_to_str(static_cast<uint8_t>(j));
-            auto maybe_match = target_dfa.match(source, &successor);
+            auto maybe_match = target_dfa.match(source, successor);
             if (maybe_match.matches() && !skip_to.empty()) {
                 ASSERT_GE(source, skip_to);
             } else if (!maybe_match.matches()) {
@@ -494,7 +505,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t
                                                                      : LevenshteinDfa::DfaType::Implicit;
                 auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
                 min_time_s = BenchmarkTimer::benchmark([&] {
-                    auto res = dfa.match(str, nullptr); // not benchmarking successor generation
+                    auto res = dfa.match(str); // not benchmarking successor generation
                     do_not_optimize_away(res);
                 }, 1.0);
             } else {
@@ -545,7 +556,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
                 auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
                 min_time_s = BenchmarkTimer::benchmark([&] {
                     for (const auto& line : dict) {
-                        auto res = dfa.match(line, nullptr);
+                        auto res = dfa.match(line);
                         do_not_optimize_away(res);
                     }
                 }, 2.0);
@@ -586,7 +597,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
                 auto end = dict.cend();
                 std::string successor;
                 while (iter != end) {
-                    auto maybe_match = dfa.match(*iter, &successor);
+                    auto maybe_match = dfa.match(*iter, successor);
                     if (maybe_match.matches()) {
                         ++iter;
                     } else {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
index 405371462ab..6c7e8a35b4e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -3,12 +3,14 @@
 
 #include <concepts>
 #include <cstdint>
+#include <string>
+#include <vector>
 
 namespace vespalib::fuzzy {
 
 // Concept that all DFA matcher implementations must satisfy
 template <typename T>
-concept DfaMatcher = requires(T a) {
+concept DfaMatcher = requires(T a, std::string u8str, std::vector<uint32_t> u32str) {
     typename T::StateType;
     typename T::StateParamType;
     typename T::EdgeType;
@@ -44,7 +46,7 @@ concept DfaMatcher = requires(T a) {
     // match any character in the target string (i.e. is always a mismatch).
     { a.match_wildcard(typename T::StateType{}) } -> std::same_as<typename T::StateType>;
 
-    // Whether there is exists an out edge from the given state that can accept a
+    // Whether there exists an out edge from the given state that can accept a
     // _higher_ UTF-32 code point value (character) than the input u32 value. Such an
     // edge _may_ be a wildcard edge, which accepts any character.
     { a.has_higher_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>;
@@ -70,6 +72,24 @@ concept DfaMatcher = requires(T a) {
 
     // Returns the state that is the result of following the given edge from the given state.
     { a.edge_to_state(typename T::StateType{}, typename T::EdgeType{}) } -> std::same_as<typename T::StateType>;
+
+    // Returns true iff the only way for the remaining input string to match the target string
+    // is for each subsequent character to match exactly. More precisely, this means that it is
+    // not possible to perform any more edits on the string. This will be the case if the
+    // current row of the Levenshtein matrix contains only 1 entry <= max_edits, and its cost
+    // is equal to max_edits.
+    // It is OK for an implementation to always return false. In this case, a slower code path
+    // (per-state stepping and character output) will be used for emitting the suffix.
+    { a.implies_exact_match_suffix(typename T::StateType{}) } -> std::same_as<bool>;
+
+    // Verbatim emit a suffix of the target string that will turn the prefix represented
+    // by the input state concatenated with the suffix into a matching string.
+    // Precondition: implies_match_suffix(state) == true, i.e. the state is guaranteed to
+    //               afford no edits anywhere.
+    { a.emit_exact_match_suffix(typename T::StateType{}, u8str) } -> std::same_as<void>;
+
+    // Same as above, but for raw UTF-32 code point output
+    { a.emit_exact_match_suffix(typename T::StateType{}, u32str) } -> std::same_as<void>;
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 6e8068a7419..630e07738fb 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -125,11 +125,16 @@ public:
 
     [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
 
+    [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override;
+
     [[nodiscard]] size_t memory_usage() const noexcept override {
         return sizeof(DfaNodeType) * _nodes.size();
     }
 
     void dump_as_graphviz(std::ostream& os) const override;
+private:
+    template <typename SuccessorT>
+    [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const;
 };
 
 template <typename Traits>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 16b1021de7b..7860d841fbf 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -79,16 +79,41 @@ struct ExplicitDfaMatcher {
     StateType edge_to_state([[maybe_unused]] StateType node, EdgeType edge) const noexcept {
         return &_nodes[edge->node];
     }
+    constexpr bool implies_exact_match_suffix(const StateType&) const noexcept {
+        // TODO; caller will currently just fall back to explicit state stepping
+        return false;
+    }
+    void emit_exact_match_suffix(const StateType&, std::string&) const {
+        // TODO (will never be called as long as `implies_exact_match_suffix()` returns false)
+        abort();
+    }
+    void emit_exact_match_suffix(const StateType&, std::vector<uint32_t>&) const {
+        // TODO (will never be called as long as `implies_exact_match_suffix()` returns false)
+        abort();
+    }
 };
 
 template <uint8_t MaxEdits>
+template <typename SuccessorT>
 LevenshteinDfa::MatchResult
-ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
+ExplicitLevenshteinDfaImpl<MaxEdits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const {
     ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
 }
 
 template <uint8_t MaxEdits>
+LevenshteinDfa::MatchResult
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
+    return match_impl(u8str, successor_out);
+}
+
+template <uint8_t MaxEdits>
+LevenshteinDfa::MatchResult
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const {
+    return match_impl(u8str, successor_out);
+}
+
+template <uint8_t MaxEdits>
 void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) const {
     os << std::dec << "digraph levenshtein_dfa {\n";
     os << "    fontname=\"Helvetica,Arial,sans-serif\"\n";
@@ -101,7 +126,7 @@ void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) co
         }
         for (const auto& edge : node.match_out_edges()) {
             std::string as_utf8;
-            append_utf32_char_as_utf8(as_utf8, edge.u32ch);
+            append_utf32_char(as_utf8, edge.u32ch);
             os << "    " << i << " -> " << edge.node << " [label=\"" << as_utf8 << "\"];\n";
         }
         if (node.wildcard_edge_to != DOOMED) {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
index 8b9d2eddcac..031973fd35d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
@@ -1,8 +1,34 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "implicit_levenshtein_dfa.hpp"
+#include "unicode_utils.h"
+#include <cassert>
 
 namespace vespalib::fuzzy {
 
+/**
+ * Builds two separate vectors that exist alongside the (possibly case-normalized) UTF-32
+ * target string:
+ *
+ *   - The UTF-8 representation of the (possibly case-normalized) target string
+ *   - An offset vector that maps each UTF-32 string index to the first byte of the
+ *     equivalent UTF-8 character.
+ *
+ * These are used for efficiently dumping an UTF-8 target string suffix from an UTF-32
+ * target string index.
+ */
+template <typename Traits>
+void ImplicitLevenshteinDfa<Traits>::precompute_utf8_target_with_offsets() {
+    _target_utf8_char_offsets.reserve(_u32_str_buf.size());
+    _target_as_utf8.reserve(_u32_str_buf.size()); // Under-reserves for all non-ASCII (TODO count via simdutf?)
+    // Important: the precomputed UTF-8 target is based on the potentially case-normalized
+    // target string, ensuring that we don't emit raw target chars for uncased successors.
+    for (uint32_t u32ch : _u32_str_buf) {
+        _target_utf8_char_offsets.emplace_back(static_cast<uint32_t>(_target_as_utf8.size()));
+        append_utf32_char(_target_as_utf8, u32ch);
+    }
+    assert(_target_as_utf8.size() < UINT32_MAX);
+}
+
 template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>;
 template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>;
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 8ef521ba14f..bb4a0918593 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -10,24 +10,37 @@ namespace vespalib::fuzzy {
 template <typename Traits>
 class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
     const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+    std::string                 _target_as_utf8;
+    std::vector<uint32_t>       _target_utf8_char_offsets;
     const bool                  _is_cased;
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
 
-    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) noexcept
+    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased)
         : _u32_str_buf(std::move(str)),
+          _target_as_utf8(),
+          _target_utf8_char_offsets(),
           _is_cased(is_cased)
-    {}
+    {
+        precompute_utf8_target_with_offsets();
+    }
 
     ~ImplicitLevenshteinDfa() override = default;
 
     [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
 
+    [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override;
+
     [[nodiscard]] size_t memory_usage() const noexcept override {
         return _u32_str_buf.size() * sizeof(uint32_t);
     }
 
     void dump_as_graphviz(std::ostream& os) const override;
+private:
+    template <typename SuccessorT>
+    [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const;
+
+    void precompute_utf8_target_with_offsets();
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index e962dba0f18..25fd3fdcc4e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -29,10 +29,17 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     using Base::is_match;
     using Base::can_match;
 
-    const bool _is_cased;
-
-    ImplicitDfaMatcher(std::span<const uint32_t> u32_str, bool is_cased) noexcept
+    std::span<const char>     _target_as_utf8;
+    std::span<const uint32_t> _target_utf8_char_offsets;
+    const bool                _is_cased;
+
+    ImplicitDfaMatcher(std::span<const uint32_t> u32_str,
+                       std::span<const char>     target_as_utf8,
+                       std::span<const uint32_t> target_utf8_char_offsets,
+                       bool is_cased) noexcept
         : Base(u32_str),
+          _target_as_utf8(target_as_utf8),
+          _target_utf8_char_offsets(target_utf8_char_offsets),
           _is_cased(is_cased)
     {}
 
@@ -109,16 +116,45 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     StateType edge_to_state(const StateType& state, EdgeType edge) const noexcept {
         return step(state, edge);
     }
+    bool implies_exact_match_suffix(const StateType& state) const noexcept {
+        // Only one entry in the sparse matrix row and it implies no edits can be done.
+        // I.e. only way to match the target string suffix is to match it _exactly_.
+        return (state.size() == 1 && state.cost(0) == max_edits());
+    }
+    // Precondition: implies_match_suffix(state)
+    void emit_exact_match_suffix(const StateType& state, std::string& u8_out) const {
+        const uint32_t target_u8_offset = _target_utf8_char_offsets[state.index(0)];
+        u8_out.append(_target_as_utf8.data() + target_u8_offset, _target_as_utf8.size() - target_u8_offset);
+    }
+    void emit_exact_match_suffix(const StateType& state, std::vector<uint32_t>& u32_out) const {
+        // TODO ranged insert
+        for (uint32_t i = state.index(0); i < _u32_str.size(); ++i) {
+            u32_out.push_back(_u32_str[i]);
+        }
+    }
 };
 
 template <typename Traits>
+template <typename SuccessorT>
 LevenshteinDfa::MatchResult
-ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _is_cased);
+ImplicitLevenshteinDfa<Traits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const {
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
 template <typename Traits>
+LevenshteinDfa::MatchResult
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
+    return match_impl(u8str, successor_out);
+}
+
+template <typename Traits>
+LevenshteinDfa::MatchResult
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const {
+    return match_impl(u8str, successor_out);
+}
+
+template <typename Traits>
 void ImplicitLevenshteinDfa<Traits>::dump_as_graphviz(std::ostream&) const {
     throw std::runtime_error("Graphviz output not available for implicit Levenshtein DFA");
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 77691ffefd8..6e38821851b 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -18,8 +18,18 @@ LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default;
 LevenshteinDfa::~LevenshteinDfa() = default;
 
 LevenshteinDfa::MatchResult
-LevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const {
-    return _impl->match(u8str, successor_out);
+LevenshteinDfa::match(std::string_view u8str) const {
+    return _impl->match(u8str, static_cast<std::vector<uint32_t>*>(nullptr)); // TODO rewire
+}
+
+LevenshteinDfa::MatchResult
+LevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const {
+    return _impl->match(u8str, &successor_out);
+}
+
+LevenshteinDfa::MatchResult
+LevenshteinDfa::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
+    return _impl->match(u8str, &successor_out);
 }
 
 size_t LevenshteinDfa::memory_usage() const noexcept {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index a8b2b77f647..85ad98e2a09 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -6,6 +6,7 @@
 #include <memory>
 #include <string>
 #include <string_view>
+#include <vector>
 
 namespace vespalib::fuzzy {
 
@@ -140,6 +141,7 @@ public:
     struct Impl {
         virtual ~Impl() = default;
         [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0;
+        [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const = 0;
         [[nodiscard]] virtual size_t memory_usage() const noexcept = 0;
         virtual void dump_as_graphviz(std::ostream& out) const = 0;
     };
@@ -169,7 +171,17 @@ public:
      * Iff `source` is _beyond_ the maximum edit distance, returns a MatchResult with
      * matches() == false.
      *
-     * Iff `successor_out` is not nullptr, the following holds:
+     */
+    [[nodiscard]] MatchResult match(std::string_view source) const;
+
+    /**
+     * Attempts to match the source string `source` with the target string this DFA was
+     * built with, emitting a successor string into `successor_out` on mismatch.
+     *
+     * See `match(source)` for semantics of returned MatchResult.
+     *
+     * In the case of a _mismatch_, the following holds:
+     *
      *   - `successor_out` is modified to contain the next (in byte-wise ordering) possible
      *     _matching_ string S so that there exists no other matching string S' that is
      *     greater than `source` but smaller than S.
@@ -199,7 +211,17 @@ public:
      * By reusing the successor string across many calls, this therefore amortizes memory
      * allocations down to near zero per invocation.
      */
-    [[nodiscard]] MatchResult match(std::string_view source, std::string* successor_out) const;
+    [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const;
+
+    /**
+     * Same as `match(source, successor_out)`, but where the successor string is defined in
+     * terms of UTF-32, not UTF-8. This avoids the need for encoding characters to UTF-8
+     * internally, and is therefore expected to be more efficient.
+     *
+     * The code point ordering of the UTF-32 successor string is identical to that its UTF-8
+     * equivalent.
+     */
+    [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const;
 
     /**
      * Returns how much memory is used by the underlying DFA representation, in bytes.
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index d9c4e0ffe36..2b3c06aa7cf 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -143,21 +143,13 @@ struct MatchAlgorithm {
      * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used
      * when one or more of the prefix characters have been lowercase-transformed.
      *
-     * TODO we could probably also optimize the smallest suffix generation with this when
-     *   we know we can no longer insert any smaller char substitutions and the only way
-     *   to complete the string is to emit it verbatim.
-     *    - To do this we'd need both the original UTF-8 target string as well as a
-     *      secondary vector that maps u32 character index to the corresponding UTF-8 index.
-     *      Both trivial to get as part of DFA initialization.
-     *
      * TODO let matcher know if source string is pre-normalized (i.e. lowercased).
-     * TODO std::u32string output; no need to re-encode to UTF-8.
      * TODO consider opportunistically appending prefix as we go instead of only when needed.
      */
-    template <DfaMatcher Matcher>
+    template <DfaMatcher Matcher, typename SuccessorT>
     static MatchResult match(const Matcher& matcher,
                              std::string_view source,
-                             std::string* successor_out)
+                             SuccessorT* successor_out)
     {
         using StateType = typename Matcher::StateType;
         Utf8Reader u8_reader(source.data(), source.size());
@@ -217,12 +209,12 @@ struct MatchAlgorithm {
      * precondition: `last_node_with_higher_out` has either a wildcard edge or a char match
      *    edge that compares greater than `input_at_branch`.
      */
-    template <DfaMatcher Matcher>
+    template <DfaMatcher Matcher, typename SuccessorT>
     static void backtrack_and_emit_greater_suffix(
             const Matcher& matcher,
             typename Matcher::StateParamType last_state_with_higher_out,
             const uint32_t input_at_branch,
-            std::string& successor)
+            SuccessorT& successor)
     {
         auto wildcard_state = matcher.match_wildcard(last_state_with_higher_out);
         if (matcher.can_match(wildcard_state)) {
@@ -235,14 +227,14 @@ struct MatchAlgorithm {
             // instead of the wildcard edge, or we'll end up in the wrong state.
             const auto next_char = input_at_branch + 1;
             if (!matcher.has_exact_explicit_out_edge(last_state_with_higher_out, next_char)) {
-                append_utf32_char_as_utf8(successor, next_char);
+                append_utf32_char(successor, next_char);
                 emit_smallest_matching_suffix(matcher, wildcard_state, successor);
                 return;
             } // else: handle exact match below (it will be found as the first higher out edge)
         }
         const auto first_highest_edge = matcher.lowest_higher_explicit_out_edge(last_state_with_higher_out, input_at_branch);
         assert(matcher.valid_edge(first_highest_edge));
-        append_utf32_char_as_utf8(successor, matcher.edge_to_u32char(first_highest_edge));
+        append_utf32_char(successor, matcher.edge_to_u32char(first_highest_edge));
         emit_smallest_matching_suffix(matcher, matcher.edge_to_state(last_state_with_higher_out, first_highest_edge), successor);
     }
 
@@ -277,29 +269,41 @@ struct MatchAlgorithm {
      */
      // TODO consider variant for only emitting _prefix of suffix_ to avoid having to generate
      //  the full string? Won't generate a matching string, but will be lexicographically greater.
-    template <DfaMatcher Matcher>
+    template <DfaMatcher Matcher, typename SuccessorT>
     static void emit_smallest_matching_suffix(
             const Matcher& matcher,
             typename Matcher::StateParamType from,
-            std::string& str)
+            SuccessorT& str)
     {
         auto state = from;
         while (!matcher.is_match(state)) {
+            // Optimization: if the only way for the remaining suffix to match is for it to be
+            // exactly equal to the suffix of the target string, emit it directly instead of
+            // stepping the state per character. This allows for matcher-specific shortcuts.
+            if (matcher.implies_exact_match_suffix(state)) {
+                matcher.emit_exact_match_suffix(state, str);
+                return;
+            }
             // If we can take a wildcard path, emit the smallest possible valid UTF-8 character (0x01).
             // Otherwise, find the smallest char that can eventually lead us to a match.
             auto wildcard_state = matcher.match_wildcard(state);
             if (matcher.can_match(wildcard_state)) {
-                str += '\x01';
+                str.push_back(0x01);
                 state = wildcard_state;
             } else {
                 const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state);
                 assert(matcher.valid_edge(smallest_out_edge));
-                append_utf32_char_as_utf8(str, matcher.edge_to_u32char(smallest_out_edge));
+                append_utf32_char(str, matcher.edge_to_u32char(smallest_out_edge));
                 state = matcher.edge_to_state(state, smallest_out_edge);
             }
         }
     }
 
+    template <typename T>
+    static constexpr bool has_8bit_value_type() noexcept {
+        return sizeof(typename T::value_type) == 1;
+    }
+
     /**
      * The successor prefix is the prefix of the source string up to (but not including) the
      * point where we emit a lexicographically higher character. Ideally we can just copy the
@@ -318,19 +322,23 @@ struct MatchAlgorithm {
      * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
      * ordering when compared byte-wise against an implicitly lowercased dictionary.
      */
-    static void emit_successor_prefix(std::string& successor_out, std::string_view source,
+    template <typename SuccessorT>
+    static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source,
                                       uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes)
     {
-        if (emit_raw_prefix_u8_bytes) {
-            successor_out = source.substr(0, n_prefix_u8_bytes);
-        } else {
-            // TODO avoid duplicate work...! :I
-            successor_out.clear();
-            Utf8Reader u8_reader(source.data(), source.size());
-            while (u8_reader.getPos() < n_prefix_u8_bytes) {
-                append_utf32_char_as_utf8(successor_out, LowerCase::convert(u8_reader.getChar()));
+        // TODO redesign prefix output wiring
+        if constexpr (has_8bit_value_type<SuccessorT>()) {
+            if (emit_raw_prefix_u8_bytes) {
+                successor_out = source.substr(0, n_prefix_u8_bytes);
+                return;
             }
         }
+        // TODO avoid duplicate work...! :I
+        successor_out.clear();
+        Utf8Reader u8_reader(source.data(), source.size());
+        while (u8_reader.getPos() < n_prefix_u8_bytes) {
+            append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar()));
+        }
     }
 
     static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
index f2c6259fd10..043618db3b5 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -40,6 +40,14 @@ std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) {
     return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size()));
 }
 
+std::string utf32_string_to_utf8(std::span<const uint32_t> u32str) {
+    std::string u8out;
+    for (uint32_t u32ch : u32str) {
+        append_utf32_char(u8out, u32ch);
+    }
+    return u8out;
+}
+
 [[noreturn]] void throw_bad_code_point(uint32_t codepoint) __attribute__((noinline));
 [[noreturn]] void throw_bad_code_point(uint32_t codepoint) {
     throw std::invalid_argument(make_string("invalid UTF-32 codepoint: U+%04X (%u)", codepoint, codepoint));
@@ -54,7 +62,7 @@ namespace {
  *
  * Returns the number of bytes written.
  *
- * See comments on append_utf32_char_as_utf8() as to why this is not a generic UTF-8
+ * See comments on append_utf32_char() as to why this is not a generic UTF-8
  * encoding function that can be used in all possible scenarios.
  */
 [[nodiscard]] uint8_t encode_utf8_char(uint32_t codepoint, unsigned char* u8buf) {
@@ -118,7 +126,7 @@ namespace {
 } // anon ns
 
 // TODO optimize inlined in header for case where u32_char is < 0x80?
-void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char) {
+void append_utf32_char(std::string& out_str, uint32_t u32_char) {
     unsigned char u8buf[4];
     uint8_t u8bytes = encode_utf8_char(u32_char, u8buf);
     out_str.append(reinterpret_cast<const char*>(u8buf), u8bytes);
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
index 711c4f1fcd0..7af0eb6e5e3 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -2,6 +2,7 @@
 #pragma once
 
 #include <cstdint>
+#include <span>
 #include <string>
 #include <string_view>
 #include <vector>
@@ -15,6 +16,8 @@ std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);
 
 std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
 
+std::string utf32_string_to_utf8(std::span<const uint32_t> u32str);
+
 /**
  * Encodes a single UTF-32 codepoint `u32_char` to a 1-4 byte UTF-8 sequence and
  * appends it to `out_str.`
@@ -31,6 +34,14 @@ std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
  * ... So don't copy this function for use as a general UTF-8 emitter, as it is not
  * _technically_ conformant!
  */
-void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char);
+void append_utf32_char(std::string& out_str, uint32_t u32_char);
+
+inline void append_utf32_char(std::u32string& out_str, uint32_t u32_char) {
+    out_str.push_back(u32_char); // no conversion needed
+}
+
+inline void append_utf32_char(std::vector<uint32_t>& out_str, uint32_t u32_char) {
+    out_str.push_back(u32_char); // no conversion needed
+}
 
 }
author	Tor Brede Vekterli <vekterli@yahooinc.com>	2023-09-18 16:19:57 +0200
committer	GitHub <noreply@github.com>	2023-09-18 16:19:57 +0200
commit	eede788d1c20d2f246f44287308dad69487369ea (patch)
tree	71406c55b9310a79bb60f93e57314729001c8500
parent	5132cfd082fbc0ef2f0c7c4a760925b613cc975e (diff)
parent	4a004d2e8d472c660cc9c2b73ee637c5247968da (diff)