summaryrefslogtreecommitdiffstats
path: root/vespalib/src
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2023-09-21 09:51:36 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2023-09-21 09:54:42 +0000
commitec0d4b0ec140bb3313fcdb27f0b2745530303b41 (patch)
tree45f22ea1527760dde54bb6dc7e2152175e335229 /vespalib/src
parent7faeffcc5901ae88c1c3d1814665d0db6ca1d900 (diff)
Split core DFA match loop into match-only and successor-emitting specializations
This allows for "hybrid" schemes where raw matching (without successor generation) is done with a dedicated matcher implementation that is faster for that particular purpose. Also gives a much tighter loop for the match-only case and removes some branches from the successor-emitting case.
Diffstat (limited to 'vespalib/src')
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h9
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp15
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h9
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp15
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h5
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp49
7 files changed, 65 insertions, 43 deletions
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 630e07738fb..b7ac35b9d19 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -123,18 +123,17 @@ public:
_nodes[from_node_idx].set_wildcard_out_edge(to_node_idx);
}
- [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view u8str) const override;
- [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::string& successor_out) const override;
+
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>& successor_out) const override;
[[nodiscard]] size_t memory_usage() const noexcept override {
return sizeof(DfaNodeType) * _nodes.size();
}
void dump_as_graphviz(std::ostream& os) const override;
-private:
- template <typename SuccessorT>
- [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const;
};
template <typename Traits>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 7860d841fbf..0a371d3b277 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -94,23 +94,24 @@ struct ExplicitDfaMatcher {
};
template <uint8_t MaxEdits>
-template <typename SuccessorT>
LevenshteinDfa::MatchResult
-ExplicitLevenshteinDfaImpl<MaxEdits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const {
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str) const {
ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
- return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
+ return MatchAlgorithm<MaxEdits>::match(matcher, u8str);
}
template <uint8_t MaxEdits>
LevenshteinDfa::MatchResult
-ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
- return match_impl(u8str, successor_out);
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string& successor_out) const {
+ ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+ return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
}
template <uint8_t MaxEdits>
LevenshteinDfa::MatchResult
-ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const {
- return match_impl(u8str, successor_out);
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
+ ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
+ return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
}
template <uint8_t MaxEdits>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index bb4a0918593..e12d8c3dedb 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -27,9 +27,11 @@ public:
~ImplicitLevenshteinDfa() override = default;
- [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view u8str) const override;
- [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::string& successor_out) const override;
+
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>& successor_out) const override;
[[nodiscard]] size_t memory_usage() const noexcept override {
return _u32_str_buf.size() * sizeof(uint32_t);
@@ -37,9 +39,6 @@ public:
void dump_as_graphviz(std::ostream& os) const override;
private:
- template <typename SuccessorT>
- [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const;
-
void precompute_utf8_target_with_offsets();
};
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index 25fd3fdcc4e..ff381b7ba7c 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -135,23 +135,24 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
};
template <typename Traits>
-template <typename SuccessorT>
LevenshteinDfa::MatchResult
-ImplicitLevenshteinDfa<Traits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const {
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str) const {
ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
- return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
+ return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str);
}
template <typename Traits>
LevenshteinDfa::MatchResult
-ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
- return match_impl(u8str, successor_out);
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string& successor_out) const {
+ ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+ return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
}
template <typename Traits>
LevenshteinDfa::MatchResult
-ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const {
- return match_impl(u8str, successor_out);
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
+ ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
+ return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
}
template <typename Traits>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 6e38821851b..1caae408176 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -19,17 +19,17 @@ LevenshteinDfa::~LevenshteinDfa() = default;
LevenshteinDfa::MatchResult
LevenshteinDfa::match(std::string_view u8str) const {
- return _impl->match(u8str, static_cast<std::vector<uint32_t>*>(nullptr)); // TODO rewire
+ return _impl->match(u8str);
}
LevenshteinDfa::MatchResult
LevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const {
- return _impl->match(u8str, &successor_out);
+ return _impl->match(u8str, successor_out);
}
LevenshteinDfa::MatchResult
LevenshteinDfa::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
- return _impl->match(u8str, &successor_out);
+ return _impl->match(u8str, successor_out);
}
size_t LevenshteinDfa::memory_usage() const noexcept {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index 85ad98e2a09..c6ca06d4de3 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -140,8 +140,9 @@ public:
struct Impl {
virtual ~Impl() = default;
- [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0;
- [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const = 0;
+ [[nodiscard]] virtual MatchResult match(std::string_view u8str) const = 0;
+ [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string& successor_out) const = 0;
+ [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>& successor_out) const = 0;
[[nodiscard]] virtual size_t memory_usage() const noexcept = 0;
virtual void dump_as_graphviz(std::ostream& out) const = 0;
};
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index 2b3c06aa7cf..fb5ec32abc7 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -149,7 +149,7 @@ struct MatchAlgorithm {
template <DfaMatcher Matcher, typename SuccessorT>
static MatchResult match(const Matcher& matcher,
std::string_view source,
- SuccessorT* successor_out)
+ SuccessorT& successor_out)
{
using StateType = typename Matcher::StateType;
Utf8Reader u8_reader(source.data(), source.size());
@@ -166,7 +166,7 @@ struct MatchAlgorithm {
if (raw_mch != mch) {
can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix
}
- if (successor_out && matcher.has_higher_out_edge(state, mch)) {
+ if (matcher.has_higher_out_edge(state, mch)) {
last_state_with_higher_out = state;
n_prefix_u8_bytes = u8_pos_before_char;
char_after_prefix = mch;
@@ -175,14 +175,12 @@ struct MatchAlgorithm {
if (matcher.can_match(maybe_next)) {
state = maybe_next;
} else {
- // Can never match; find the successor if requested
- if (successor_out) {
- emit_successor_prefix(*successor_out, source, n_prefix_u8_bytes,
- matcher.is_cased() || can_use_raw_prefix);
- assert(matcher.valid_state(last_state_with_higher_out));
- backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
- char_after_prefix, *successor_out);
- }
+ // Can never match; find the successor
+ emit_successor_prefix(successor_out, source, n_prefix_u8_bytes,
+ matcher.is_cased() || can_use_raw_prefix);
+ assert(matcher.valid_state(last_state_with_higher_out));
+ backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
+ char_after_prefix, successor_out);
return MatchResult::make_mismatch(max_edits());
}
}
@@ -190,10 +188,33 @@ struct MatchAlgorithm {
if (edits <= max_edits()) {
return MatchResult::make_match(max_edits(), edits);
}
- if (successor_out) {
- emit_successor_prefix(*successor_out, source, source.size(),
- matcher.is_cased() || can_use_raw_prefix);
- emit_smallest_matching_suffix(matcher, state, *successor_out);
+ emit_successor_prefix(successor_out, source, source.size(),
+ matcher.is_cased() || can_use_raw_prefix);
+ emit_smallest_matching_suffix(matcher, state, successor_out);
+ return MatchResult::make_mismatch(max_edits());
+ }
+
+ /**
+ * Simplified match loop which does _not_ emit a successor on mismatch. Otherwise the
+ * exact same semantics as the successor-emitting `match()` overload.
+ */
+ template <DfaMatcher Matcher>
+ static MatchResult match(const Matcher& matcher, std::string_view source) {
+ using StateType = typename Matcher::StateType;
+ Utf8Reader u8_reader(source.data(), source.size());
+ StateType state = matcher.start();
+ while (u8_reader.hasMore()) {
+ const uint32_t mch = normalized_match_char(u8_reader.getChar(), matcher.is_cased());
+ auto maybe_next = matcher.match_input(state, mch);
+ if (matcher.can_match(maybe_next)) {
+ state = maybe_next;
+ } else {
+ return MatchResult::make_mismatch(max_edits());
+ }
+ }
+ const auto edits = matcher.match_edit_distance(state);
+ if (edits <= max_edits()) {
+ return MatchResult::make_match(max_edits(), edits);
}
return MatchResult::make_mismatch(max_edits());
}