diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-09-21 09:51:36 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-09-21 09:54:42 +0000 |
commit | ec0d4b0ec140bb3313fcdb27f0b2745530303b41 (patch) | |
tree | 45f22ea1527760dde54bb6dc7e2152175e335229 /vespalib | |
parent | 7faeffcc5901ae88c1c3d1814665d0db6ca1d900 (diff) |
Split core DFA match loop into match-only and successor-emitting specializations
This allows for "hybrid" schemes where raw matching (without
successor generation) is done with a dedicated matcher implementation
that is faster for that particular purpose.
Also gives a much tighter loop for the match-only case and
removes some branches from the successor-emitting case.
Diffstat (limited to 'vespalib')
7 files changed, 65 insertions, 43 deletions
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h index 630e07738fb..b7ac35b9d19 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h @@ -123,18 +123,17 @@ public: _nodes[from_node_idx].set_wildcard_out_edge(to_node_idx); } - [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view u8str) const override; - [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view u8str, std::string& successor_out) const override; + + [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>& successor_out) const override; [[nodiscard]] size_t memory_usage() const noexcept override { return sizeof(DfaNodeType) * _nodes.size(); } void dump_as_graphviz(std::ostream& os) const override; -private: - template <typename SuccessorT> - [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const; }; template <typename Traits> diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp index 7860d841fbf..0a371d3b277 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp @@ -94,23 +94,24 @@ struct ExplicitDfaMatcher { }; template <uint8_t MaxEdits> -template <typename SuccessorT> LevenshteinDfa::MatchResult -ExplicitLevenshteinDfaImpl<MaxEdits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const { +ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str) const { ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); - return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out); + return MatchAlgorithm<MaxEdits>::match(matcher, u8str); } template <uint8_t MaxEdits> LevenshteinDfa::MatchResult -ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const { - return match_impl(u8str, successor_out); +ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string& successor_out) const { + ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); + return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out); } template <uint8_t MaxEdits> LevenshteinDfa::MatchResult -ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const { - return match_impl(u8str, successor_out); +ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { + ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); + return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out); } template <uint8_t MaxEdits> diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h index bb4a0918593..e12d8c3dedb 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h @@ -27,9 +27,11 @@ public: ~ImplicitLevenshteinDfa() override = default; - [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view u8str) const override; - [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view u8str, std::string& successor_out) const override; + + [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>& successor_out) const override; [[nodiscard]] size_t memory_usage() const noexcept override { return _u32_str_buf.size() * sizeof(uint32_t); @@ -37,9 +39,6 @@ public: void dump_as_graphviz(std::ostream& os) const override; private: - template <typename SuccessorT> - [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const; - void precompute_utf8_target_with_offsets(); }; diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp index 25fd3fdcc4e..ff381b7ba7c 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp @@ -135,23 +135,24 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> { }; template <typename Traits> -template <typename SuccessorT> LevenshteinDfa::MatchResult -ImplicitLevenshteinDfa<Traits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const { +ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str) const { ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); - return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); + return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str); } template <typename Traits> LevenshteinDfa::MatchResult -ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const { - return match_impl(u8str, successor_out); +ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string& successor_out) const { + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); } template <typename Traits> LevenshteinDfa::MatchResult -ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const { - return match_impl(u8str, successor_out); +ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); + return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); } template <typename Traits> diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index 6e38821851b..1caae408176 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -19,17 +19,17 @@ LevenshteinDfa::~LevenshteinDfa() = default; LevenshteinDfa::MatchResult LevenshteinDfa::match(std::string_view u8str) const { - return _impl->match(u8str, static_cast<std::vector<uint32_t>*>(nullptr)); // TODO rewire + return _impl->match(u8str); } LevenshteinDfa::MatchResult LevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const { - return _impl->match(u8str, &successor_out); + return _impl->match(u8str, successor_out); } LevenshteinDfa::MatchResult LevenshteinDfa::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { - return _impl->match(u8str, &successor_out); + return _impl->match(u8str, successor_out); } size_t LevenshteinDfa::memory_usage() const noexcept { diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index 85ad98e2a09..c6ca06d4de3 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -140,8 +140,9 @@ public: struct Impl { virtual ~Impl() = default; - [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0; - [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const = 0; + [[nodiscard]] virtual MatchResult match(std::string_view u8str) const = 0; + [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string& successor_out) const = 0; + [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>& successor_out) const = 0; [[nodiscard]] virtual size_t memory_usage() const noexcept = 0; virtual void dump_as_graphviz(std::ostream& out) const = 0; }; diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index 2b3c06aa7cf..fb5ec32abc7 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -149,7 +149,7 @@ struct MatchAlgorithm { template <DfaMatcher Matcher, typename SuccessorT> static MatchResult match(const Matcher& matcher, std::string_view source, - SuccessorT* successor_out) + SuccessorT& successor_out) { using StateType = typename Matcher::StateType; Utf8Reader u8_reader(source.data(), source.size()); @@ -166,7 +166,7 @@ struct MatchAlgorithm { if (raw_mch != mch) { can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix } - if (successor_out && matcher.has_higher_out_edge(state, mch)) { + if (matcher.has_higher_out_edge(state, mch)) { last_state_with_higher_out = state; n_prefix_u8_bytes = u8_pos_before_char; char_after_prefix = mch; @@ -175,14 +175,12 @@ struct MatchAlgorithm { if (matcher.can_match(maybe_next)) { state = maybe_next; } else { - // Can never match; find the successor if requested - if (successor_out) { - emit_successor_prefix(*successor_out, source, n_prefix_u8_bytes, - matcher.is_cased() || can_use_raw_prefix); - assert(matcher.valid_state(last_state_with_higher_out)); - backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out, - char_after_prefix, *successor_out); - } + // Can never match; find the successor + emit_successor_prefix(successor_out, source, n_prefix_u8_bytes, + matcher.is_cased() || can_use_raw_prefix); + assert(matcher.valid_state(last_state_with_higher_out)); + backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out, + char_after_prefix, successor_out); return MatchResult::make_mismatch(max_edits()); } } @@ -190,10 +188,33 @@ struct MatchAlgorithm { if (edits <= max_edits()) { return MatchResult::make_match(max_edits(), edits); } - if (successor_out) { - emit_successor_prefix(*successor_out, source, source.size(), - matcher.is_cased() || can_use_raw_prefix); - emit_smallest_matching_suffix(matcher, state, *successor_out); + emit_successor_prefix(successor_out, source, source.size(), + matcher.is_cased() || can_use_raw_prefix); + emit_smallest_matching_suffix(matcher, state, successor_out); + return MatchResult::make_mismatch(max_edits()); + } + + /** + * Simplified match loop which does _not_ emit a successor on mismatch. Otherwise the + * exact same semantics as the successor-emitting `match()` overload. + */ + template <DfaMatcher Matcher> + static MatchResult match(const Matcher& matcher, std::string_view source) { + using StateType = typename Matcher::StateType; + Utf8Reader u8_reader(source.data(), source.size()); + StateType state = matcher.start(); + while (u8_reader.hasMore()) { + const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased()); + auto maybe_next = matcher.match_input(state, mch); + if (matcher.can_match(maybe_next)) { + state = maybe_next; + } else { + return MatchResult::make_mismatch(max_edits()); + } + } + const auto edits = matcher.match_edit_distance(state); + if (edits <= max_edits()) { + return MatchResult::make_match(max_edits(), edits); } return MatchResult::make_mismatch(max_edits()); } |