summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHÃ¥vard Pettersen <3535158+havardpe@users.noreply.github.com>2023-09-29 11:57:01 +0200
committerGitHub <noreply@github.com>2023-09-29 11:57:01 +0200
commit9887e821f2a5a870276416ad6c9f508f4a03dae9 (patch)
treeae1382d58adc743775fca7cacdee7c9037194b8c /vespalib
parent5b3c08a08be7984fc7bceaf6e56619c2c81da34d (diff)
parent5a62ffcd58eb7aa73ed2717fea2d9a8708d6381f (diff)
Merge pull request #28714 from vespa-engine/havardpe/better-graphviz-for-table-dfa
dump table_dfa as actual dfa in graphviz
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp16
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp158
2 files changed, 133 insertions, 41 deletions
diff --git a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
index 4ff48348790..acb68a56de7 100644
--- a/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/table_dfa/table_dfa_test.cpp
@@ -284,14 +284,6 @@ TEST(TableDfaTest, minimal_boundary_will_never_exceed_word_end_with_max_edits_2)
verify_word_end_boundary<2>();
}
-TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) {
- auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table);
- std::ostringstream out;
- dfa.dump_as_graphviz(out);
- fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage());
- fprintf(stderr, "%s", out.str().c_str());
-}
-
template <uint8_t N>
void verify_inline_tfa() {
auto tfa = make_tfa<N>();
@@ -392,4 +384,12 @@ TEST(TableDfaTest, graphviz_for_tfa_with_max_edits_1) {
dump_tfa_graph<1>();
}
+TEST(TableDfaTest, graphviz_for_food_with_max_edits_1) {
+ auto dfa = LevenshteinDfa::build("food", 1, LevenshteinDfa::Casing::Cased, LevenshteinDfa::DfaType::Table);
+ std::ostringstream out;
+ dfa.dump_as_graphviz(out);
+ fprintf(stderr, "memory usage: %zu\n", dfa.memory_usage());
+ fprintf(stderr, "%s", out.str().c_str());
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
index d3cc9002e2b..eebf1f4429d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/table_dfa.hpp
@@ -8,6 +8,8 @@
#include <stdexcept>
#include <algorithm>
#include <map>
+#include <set>
+#include <queue>
namespace vespalib::fuzzy {
@@ -286,24 +288,65 @@ vespalib::string format_vector(const std::vector<T> &vector, bool compact = fals
return str;
}
+// A table-based state using the InlineTfa tables to simulate stepping
+// a dfa with max edit distance N. The state itself is represented by
+// a number used as offset into these tables (state). Since the state
+// is parametric, we also store the minimal boundary of the state
+// separately (index).
template <uint8_t N>
-struct TableMatcher {
- struct S {
- uint32_t index;
- uint32_t state;
- // needed by dfa matcher concept (should use std::declval instead)
- constexpr S() noexcept : index(0), state(0) {}
- constexpr S(uint32_t i, uint32_t s) noexcept : index(i), state(s) {}
- S next(uint32_t bits) noexcept {
- const auto &entry = InlineTfa<N>::table[state][bits];
- return S(index + entry.step, entry.state);
+struct TfaState {
+ uint32_t index;
+ uint32_t state;
+ // needed by dfa matcher concept (should use std::declval instead)
+ constexpr TfaState() noexcept : index(0), state(0) {}
+ constexpr TfaState(uint32_t i, uint32_t s) noexcept : index(i), state(s) {}
+ static constexpr TfaState start() { return TfaState(0, 1); }
+ constexpr bool valid() const noexcept { return state != 0; }
+ constexpr TfaState next(uint32_t bits) const noexcept {
+ const auto &entry = InlineTfa<N>::table[state][bits];
+ return TfaState(index + entry.step, entry.state);
+ }
+ constexpr bool is_valid_edge(uint32_t bits) const noexcept {
+ return InlineTfa<N>::table[state][bits].state != 0;
+ }
+ constexpr uint8_t edits(uint32_t end) const noexcept {
+ uint32_t leap = (end - index);
+ return (leap < window_size<N>()) ? InlineTfa<N>::edits[state][leap] : N + 1;
+ }
+ // for pretty graphviz dumping; minimal possible edits given perfect input from here on
+ constexpr uint32_t min_edits() const noexcept {
+ uint8_t res = N + 1;
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ res = std::min(res, InlineTfa<N>::edits[state][i]);
}
- constexpr bool is_valid_edge(uint32_t bits) const noexcept {
- return InlineTfa<N>::table[state][bits].state != 0;
+ return res;
+ }
+ // for pretty graphviz dumping; actual edits needed to reach the word end from a valid state
+ constexpr uint32_t exact_edits(uint32_t end) const noexcept {
+ assert(valid());
+ uint32_t res = end;
+ for (uint32_t i = 0; i < window_size<N>(); ++i) {
+ if (uint32_t e = InlineTfa<N>::edits[state][i]; e <= N) {
+ res = std::min(res, e + diff(index + i, end));
+ }
}
- };
- using StateType = S;
- using StateParamType = StateType;
+ return res;
+ }
+ // for pretty graphviz dumping; enable using in map and set
+ bool operator<(const TfaState &rhs) const noexcept {
+ return std::tie(index, state) < std::tie(rhs.index, rhs.state);
+ }
+ // for pretty graphviz dumping; check for redundant edges
+ bool operator==(const TfaState &rhs) const noexcept {
+ return std::tie(index, state) == std::tie(rhs.index, rhs.state);
+ }
+};
+
+template <uint8_t N>
+struct TableMatcher {
+ using S = TfaState<N>;
+ using StateType = TfaState<N>;
+ using StateParamType = TfaState<N>;
using EdgeType = uint32_t;
const TableDfa<N>::Lookup *lookup;
@@ -314,16 +357,13 @@ struct TableMatcher {
noexcept : lookup(lookup_in), end(end_in), cased(cased_in) {}
bool is_cased() const noexcept { return cased; }
- static constexpr S start() noexcept { return S(0, 1); }
+ static constexpr S start() noexcept { return S::start(); }
- uint8_t match_edit_distance(S s) const noexcept {
- uint32_t leap = (end - s.index);
- return (leap < window_size<N>()) ? InlineTfa<N>::edits[s.state][leap] : N + 1;
- }
- bool is_match(S s) const noexcept { return match_edit_distance(s) <= N; }
+ uint8_t match_edit_distance(S s) const noexcept { return s.edits(end); }
+ bool is_match(S s) const noexcept { return s.edits(end) <= N; }
- static constexpr bool can_match(S s) noexcept { return (s.state != 0); }
- static constexpr bool valid_state(S s) noexcept { return (s.state != 0); }
+ static constexpr bool can_match(S s) noexcept { return s.valid(); }
+ static constexpr bool valid_state(S) noexcept { return true; }
S match_wildcard(S s) const noexcept { return s.next(0); }
S match_input(S s, uint32_t c) const noexcept {
@@ -381,7 +421,7 @@ struct TableMatcher {
return 0;
}
- static constexpr bool valid_edge(uint32_t c) noexcept { return c != 0; }
+ static constexpr bool valid_edge(uint32_t) noexcept { return true; }
static constexpr uint32_t edge_to_u32char(uint32_t c) noexcept { return c; }
S edge_to_state(S s, uint32_t c) const noexcept { return match_input(s, c); }
@@ -476,16 +516,68 @@ template <uint8_t N>
void
TableDfa<N>::dump_as_graphviz(std::ostream &os) const
{
+ using state_t = TfaState<N>;
+ auto id_of = [state_ids = std::map<state_t,uint32_t>()](state_t state) mutable {
+ return state_ids.emplace(state, state_ids.size()).first->second;
+ };
+ auto explore = [explored = std::set<state_t>()](state_t state) mutable {
+ return explored.insert(state).second;
+ };
+ auto edits = [end = (_lookup.size() - 1)](state_t state) noexcept -> uint32_t {
+ return state.exact_edits(end);
+ };
+ struct Edge {
+ uint32_t input;
+ state_t from;
+ state_t to;
+ operator bool() const { return to.valid(); }
+ };
+ auto best_edge = [&](const Edge &a, const Edge &b) {
+ // inverted due to priority queue
+ if (a.to.min_edits() == b.to.min_edits()) {
+ return edits(a.to) > edits(b.to);
+ }
+ return a.to.min_edits() > b.to.min_edits();
+ };
+ auto todo = std::priority_queue<Edge,std::vector<Edge>,decltype(best_edge)>(best_edge);
+ auto handle_state = [&](state_t state) {
+ if (explore(state)) {
+ // number states by following the best edges first
+ auto my_id = id_of(state);
+ if (edits(state) <= N) {
+ os << " " << my_id << " [label=\"" << my_id << "(" << edits(state) << ")\", style=\"filled\"];\n";
+ }
+ auto null_edge = Edge{0, state, state.next(0)};
+ if (null_edge) {
+ todo.push(null_edge);
+ }
+ for (auto [c, bits]: _lookup[state.index].list) {
+ if (auto edge = Edge{c, state, state.next(bits)}; edge && edge.to != null_edge.to) {
+ // only process valid out edges that are not covered by the null_edge
+ todo.push(edge);
+ }
+ }
+ }
+ };
+ auto handle_edge = [&](Edge edge) {
+ handle_state(edge.to);
+ if (edge.input != 0) {
+ std::string as_utf8;
+ append_utf32_char(as_utf8, edge.input);
+ os << " " << id_of(edge.from) << " -> " << id_of(edge.to) << " [label=\"" << as_utf8 << "\"];\n";
+ } else {
+ os << " " << id_of(edge.from) << " -> " << id_of(edge.to) << " [label=\"*\"];\n";
+ }
+ };
os << std::dec << "digraph table_dfa {\n";
- for (size_t i = 0; i < _lookup.size(); ++i) {
- for (size_t j = 0; j < window_size(); ++j) {
- if (_lookup[i].list[j].input != 0) {
- std::string as_utf8;
- append_utf32_char(as_utf8, _lookup[i].list[j].input);
- os << " x" << i << " -> " << format_vector(expand_bits<N>(_lookup[i].list[j].match), true) << " [label=\"" << as_utf8 << "\"];\n";
- }
- }
- os << " x" << i << " -> " << format_vector(expand_bits<N>(0), true) << " [label=\"*\"];\n";
+ os << " fontname=\"Helvetica,Arial,sans-serif\"\n";
+ os << " node [shape=circle, fontname=\"Helvetica,Arial,sans-serif\", fixedsize=true];\n";
+ os << " edge [fontname=\"Helvetica,Arial,sans-serif\"];\n";
+ handle_state(state_t::start());
+ while (!todo.empty()) {
+ auto next_edge = todo.top();
+ todo.pop();
+ handle_edge(next_edge);
}
os << "}\n";
}