1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "explicit_levenshtein_dfa.h"
#include "implicit_levenshtein_dfa.h"
#include "table_dfa.h"
#include "levenshtein_dfa.h"
#include "unicode_utils.h"
#include <vespa/vespalib/util/stringfmt.h>
#include <memory>
namespace vespalib::fuzzy {
LevenshteinDfa::LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept
: _impl(std::move(impl))
{}
LevenshteinDfa::LevenshteinDfa(LevenshteinDfa&&) noexcept = default;
LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default;
LevenshteinDfa::~LevenshteinDfa() = default;
LevenshteinDfa::MatchResult
LevenshteinDfa::match(std::string_view u8str) const {
return _impl->match(u8str);
}
LevenshteinDfa::MatchResult
LevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const {
return _impl->match(u8str, successor_out);
}
LevenshteinDfa::MatchResult
LevenshteinDfa::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
return _impl->match(u8str, successor_out);
}
size_t LevenshteinDfa::memory_usage() const noexcept {
return _impl->memory_usage();
}
void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
_impl->dump_as_graphviz(out);
}
LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
if (max_edits != 1 && max_edits != 2) {
throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
}
const bool is_cased = (casing == Casing::Cased);
auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string)
: utf8_string_to_utf32_lowercased(target_string);
if (dfa_type == DfaType::Implicit) {
if (max_edits == 1) {
return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased));
} else { // max_edits == 2
return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
}
} else if(dfa_type == DfaType::Explicit) {
if (max_edits == 1) {
return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
} else { // max_edits == 2
return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
}
} else { // DfaType::Table
if (max_edits == 1) {
return LevenshteinDfa(std::make_unique<TableDfa<1>>(std::move(target_string_u32), is_cased));
} else { // max_edits == 2
return LevenshteinDfa(std::make_unique<TableDfa<2>>(std::move(target_string_u32), is_cased));
}
}
}
LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
// TODO automatically select implementation based on target length/max edits?
// Suggestion:
// - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64).
// - Implicit DFA otherwise.
// This keeps memory overhead < 64k and DFA construction time < 300 usec (measured on
// an M1 Pro; your mileage may vary etc).
// Ideally the implicit DFA would always be the fastest (or at least approximately as
// fast as the explicit DFA), but this is not yet the case.
return build(target_string, max_edits, casing, DfaType::Implicit);
}
std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) {
if (mos.matches()) {
os << "match(" << static_cast<int>(mos.edits()) << " edits)";
} else {
os << "mismatch";
}
return os;
}
std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) {
if (dt == LevenshteinDfa::DfaType::Implicit) {
os << "Implicit";
} else if (dt == LevenshteinDfa::DfaType::Explicit) {
os << "Explicit";
} else {
assert(dt == LevenshteinDfa::DfaType::Table);
os << "Table";
}
return os;
}
std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) {
if (c == LevenshteinDfa::Casing::Uncased) {
os << "Uncased";
} else {
assert(c == LevenshteinDfa::Casing::Cased);
os << "Cased";
}
return os;
}
}
|