Optimize successor generation of exact match suffix

Avoids explicitly stepping the DFA states and handling each transition character separately by detecting the case where the only way the generated suffix can possibly match is for it to _exactly_ match the remaining target string suffix. This is the case when the sparse cost matrix row only has 1 column remaining, and this column is equal to the max edit distance. I.e. no more edits can possibly be done. In local synthetic benchmarks this speeds up successor generation 4x when the target string is 64 characters.
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2023-09-15 15:01:53 +0000
committer: Tor Brede Vekterli <vekterli@yahooinc.com> 2023-09-18 13:21:04 +0000
commit: 15e0836eade4614e944a0946b327305e5c8966cf (patch)
tree: 8f59bc1075739f093065d4b90ef17725819617fc
parent: b3831f1e2ef267c6eab26b1fa865347cc36b1dcc (diff)
8 files changed, 93 insertions, 16 deletions
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
index 405371462ab..4ff689b1e01 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -8,7 +8,7 @@ namespace vespalib::fuzzy {
 
 // Concept that all DFA matcher implementations must satisfy
 template <typename T>
-concept DfaMatcher = requires(T a) {
+concept DfaMatcher = requires(T a, std::string u8str) {
     typename T::StateType;
     typename T::StateParamType;
     typename T::EdgeType;
@@ -44,7 +44,7 @@ concept DfaMatcher = requires(T a) {
     // match any character in the target string (i.e. is always a mismatch).
     { a.match_wildcard(typename T::StateType{}) } -> std::same_as<typename T::StateType>;
 
-    // Whether there is exists an out edge from the given state that can accept a
+    // Whether there exists an out edge from the given state that can accept a
     // _higher_ UTF-32 code point value (character) than the input u32 value. Such an
     // edge _may_ be a wildcard edge, which accepts any character.
     { a.has_higher_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>;
@@ -70,6 +70,21 @@ concept DfaMatcher = requires(T a) {
 
     // Returns the state that is the result of following the given edge from the given state.
     { a.edge_to_state(typename T::StateType{}, typename T::EdgeType{}) } -> std::same_as<typename T::StateType>;
+
+    // Returns true iff the only way for the remaining input string to match the target string
+    // is for each subsequent character to match exactly. More precisely, this means that it is
+    // not possible to perform any more edits on the string. This will be the case if the
+    // current row of the Levenshtein matrix contains only 1 entry <= max_edits, and its cost
+    // is equal to max_edits.
+    // It is OK for an implementation to always return false. In this case, a slower code path
+    // (per-state stepping and character output) will be used for emitting the suffix.
+    { a.implies_exact_match_suffix(typename T::StateType{}) } -> std::same_as<bool>;
+
+    // Verbatim emit a suffix of the target string that will turn the prefix represented
+    // by the input state concatenated with the suffix into a matching string.
+    // Precondition: implies_match_suffix(state) == true, i.e. the state is guaranteed to
+    //               afford no edits anywhere.
+    { a.emit_exact_match_suffix(typename T::StateType{}, u8str) } -> std::same_as<void>;
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 16b1021de7b..8ec05199bbe 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -79,6 +79,14 @@ struct ExplicitDfaMatcher {
     StateType edge_to_state([[maybe_unused]] StateType node, EdgeType edge) const noexcept {
         return &_nodes[edge->node];
     }
+    constexpr bool implies_exact_match_suffix(const StateType&) const noexcept {
+        // TODO
+        return false;
+    }
+    void emit_exact_match_suffix(const StateType&, std::string&) const {
+        // TODO (will never be called as long as `implies_exact_match_suffix()` returns false)
+        abort();
+    }
 };
 
 template <uint8_t MaxEdits>
@@ -101,7 +109,7 @@ void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) co
         }
         for (const auto& edge : node.match_out_edges()) {
             std::string as_utf8;
-            append_utf32_char_as_utf8(as_utf8, edge.u32ch);
+            append_utf32_char(as_utf8, edge.u32ch);
             os << "    " << i << " -> " << edge.node << " [label=\"" << as_utf8 << "\"];\n";
         }
         if (node.wildcard_edge_to != DOOMED) {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
index 8b9d2eddcac..7c77f7a0ed2 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
@@ -1,8 +1,22 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "implicit_levenshtein_dfa.hpp"
+#include "unicode_utils.h"
+#include <cassert>
 
 namespace vespalib::fuzzy {
 
+template <typename Traits>
+void ImplicitLevenshteinDfa<Traits>::precompute_utf8_target_with_offsets() {
+    _target_utf8_char_offsets.reserve(_u32_str_buf.size());
+    // Important: the precomputed UTF-8 target is based on the potentially case-normalized
+    // target string, ensuring that we don't emit raw target chars for uncased successors.
+    for (uint32_t u32ch : _u32_str_buf) {
+        _target_utf8_char_offsets.emplace_back(static_cast<uint32_t>(_target_as_utf8.size()));
+        append_utf32_char(_target_as_utf8, u32ch);
+    }
+    assert(_target_as_utf8.size() < UINT32_MAX);
+}
+
 template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>;
 template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>;
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 8ef521ba14f..0b8ddc71ed4 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -10,14 +10,20 @@ namespace vespalib::fuzzy {
 template <typename Traits>
 class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
     const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+    std::string                 _target_as_utf8;
+    std::vector<uint32_t>       _target_utf8_char_offsets;
     const bool                  _is_cased;
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
 
-    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) noexcept
+    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased)
         : _u32_str_buf(std::move(str)),
+          _target_as_utf8(),
+          _target_utf8_char_offsets(),
           _is_cased(is_cased)
-    {}
+    {
+        precompute_utf8_target_with_offsets();
+    }
 
     ~ImplicitLevenshteinDfa() override = default;
 
@@ -28,6 +34,8 @@ public:
     }
 
     void dump_as_graphviz(std::ostream& os) const override;
+private:
+    void precompute_utf8_target_with_offsets();
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index e962dba0f18..24c63931661 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -29,10 +29,17 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     using Base::is_match;
     using Base::can_match;
 
-    const bool _is_cased;
-
-    ImplicitDfaMatcher(std::span<const uint32_t> u32_str, bool is_cased) noexcept
+    std::span<const char>     _target_as_utf8;
+    std::span<const uint32_t> _target_utf8_char_offsets;
+    const bool                _is_cased;
+
+    ImplicitDfaMatcher(std::span<const uint32_t> u32_str,
+                       std::span<const char>     target_as_utf8,
+                       std::span<const uint32_t> target_utf8_char_offsets,
+                       bool is_cased) noexcept
         : Base(u32_str),
+          _target_as_utf8(target_as_utf8),
+          _target_utf8_char_offsets(target_utf8_char_offsets),
           _is_cased(is_cased)
     {}
 
@@ -109,12 +116,22 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     StateType edge_to_state(const StateType& state, EdgeType edge) const noexcept {
         return step(state, edge);
     }
+    bool implies_exact_match_suffix(const StateType& state) const noexcept {
+        // Only one entry in the sparse matrix row and it implies no edits can be done
+        return (state.size() == 1 && state.cost(0) == max_edits());
+    }
+    // Precondition: implies_match_suffix(state)
+    void emit_exact_match_suffix(const StateType& state, std::string& u8_out) const {
+        const uint32_t target_u8_offset = _target_utf8_char_offsets[state.index(0)];
+        u8_out.append(_target_as_utf8.data() + target_u8_offset, _target_as_utf8.size() - target_u8_offset);
+    }
+    // TODO emit_exact_match_suffix(const StateType& state, std::u32string& u32_out)
 };
 
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _is_cased);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index d9c4e0ffe36..ff1835e7bf1 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -235,14 +235,14 @@ struct MatchAlgorithm {
             // instead of the wildcard edge, or we'll end up in the wrong state.
             const auto next_char = input_at_branch + 1;
             if (!matcher.has_exact_explicit_out_edge(last_state_with_higher_out, next_char)) {
-                append_utf32_char_as_utf8(successor, next_char);
+                append_utf32_char(successor, next_char);
                 emit_smallest_matching_suffix(matcher, wildcard_state, successor);
                 return;
             } // else: handle exact match below (it will be found as the first higher out edge)
         }
         const auto first_highest_edge = matcher.lowest_higher_explicit_out_edge(last_state_with_higher_out, input_at_branch);
         assert(matcher.valid_edge(first_highest_edge));
-        append_utf32_char_as_utf8(successor, matcher.edge_to_u32char(first_highest_edge));
+        append_utf32_char(successor, matcher.edge_to_u32char(first_highest_edge));
         emit_smallest_matching_suffix(matcher, matcher.edge_to_state(last_state_with_higher_out, first_highest_edge), successor);
     }
 
@@ -285,6 +285,13 @@ struct MatchAlgorithm {
     {
         auto state = from;
         while (!matcher.is_match(state)) {
+            // Optimization: if the only way for the remaining suffix to match is for it to be
+            // exactly equal to the suffix of the target string, emit it directly instead of
+            // stepping the state per character. This allows for matcher-specific shortcuts.
+            if (matcher.implies_exact_match_suffix(state)) {
+                matcher.emit_exact_match_suffix(state, str);
+                return;
+            }
             // If we can take a wildcard path, emit the smallest possible valid UTF-8 character (0x01).
             // Otherwise, find the smallest char that can eventually lead us to a match.
             auto wildcard_state = matcher.match_wildcard(state);
@@ -294,7 +301,7 @@ struct MatchAlgorithm {
             } else {
                 const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state);
                 assert(matcher.valid_edge(smallest_out_edge));
-                append_utf32_char_as_utf8(str, matcher.edge_to_u32char(smallest_out_edge));
+                append_utf32_char(str, matcher.edge_to_u32char(smallest_out_edge));
                 state = matcher.edge_to_state(state, smallest_out_edge);
             }
         }
@@ -328,7 +335,7 @@ struct MatchAlgorithm {
             successor_out.clear();
             Utf8Reader u8_reader(source.data(), source.size());
             while (u8_reader.getPos() < n_prefix_u8_bytes) {
-                append_utf32_char_as_utf8(successor_out, LowerCase::convert(u8_reader.getChar()));
+                append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar()));
             }
         }
     }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
index f2c6259fd10..f72518cc68f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -54,7 +54,7 @@ namespace {
  *
  * Returns the number of bytes written.
  *
- * See comments on append_utf32_char_as_utf8() as to why this is not a generic UTF-8
+ * See comments on append_utf32_char() as to why this is not a generic UTF-8
  * encoding function that can be used in all possible scenarios.
  */
 [[nodiscard]] uint8_t encode_utf8_char(uint32_t codepoint, unsigned char* u8buf) {
@@ -118,7 +118,7 @@ namespace {
 } // anon ns
 
 // TODO optimize inlined in header for case where u32_char is < 0x80?
-void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char) {
+void append_utf32_char(std::string& out_str, uint32_t u32_char) {
     unsigned char u8buf[4];
     uint8_t u8bytes = encode_utf8_char(u32_char, u8buf);
     out_str.append(reinterpret_cast<const char*>(u8buf), u8bytes);
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
index 711c4f1fcd0..a2e9e232555 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -31,6 +31,14 @@ std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
  * ... So don't copy this function for use as a general UTF-8 emitter, as it is not
  * _technically_ conformant!
  */
-void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char);
+void append_utf32_char(std::string& out_str, uint32_t u32_char);
+
+inline void append_utf32_char(std::u32string& out_str, uint32_t u32_char) {
+    out_str.push_back(u32_char); // no conversion needed
+}
+
+inline void append_utf32_char(std::vector<uint32_t>& out_str, uint32_t u32_char) {
+    out_str.push_back(u32_char); // no conversion needed
+}
 
 }
author	Tor Brede Vekterli <vekterli@yahooinc.com>	2023-09-15 15:01:53 +0000
committer	Tor Brede Vekterli <vekterli@yahooinc.com>	2023-09-18 13:21:04 +0000
commit	15e0836eade4614e944a0946b327305e5c8966cf (patch)
tree	8f59bc1075739f093065d4b90ef17725819617fc
parent	b3831f1e2ef267c6eab26b1fa865347cc36b1dcc (diff)