aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
blob: 3ae2bdd3598e4b89846c628d520c2b62684d31a2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "dfa_fuzzy_matcher.h"
#include "i_enum_store_dictionary.h"
#include <vespa/vespalib/text/utf8.h>
#include <vespa/vespalib/text/lowercase.h>

using vespalib::fuzzy::LevenshteinDfa;
using vespalib::LowerCase;
using vespalib::Utf8Reader;
using vespalib::Utf8ReaderForZTS;

namespace search::attribute {

namespace {

std::vector<uint32_t>
extract_prefix(std::string_view target, uint32_t prefix_size, bool cased)
{
    std::vector<uint32_t> result;
    result.reserve(prefix_size);
    Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
    for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) {
        uint32_t code_point = reader.getChar();
        if (!cased) {
            code_point = LowerCase::convert(code_point);
        }
        result.emplace_back(code_point);
    }
    return result;
}

std::string_view
extract_suffix(std::string_view target, uint32_t prefix_size)
{
    Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
    for (size_t pos = 0; pos < prefix_size && reader.hasMore(); ++pos) {
        (void) reader.getChar();
    }
    std::string_view result = target;
    result.remove_prefix(reader.getPos());
    return result;
}

}

DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size,
                                 bool cased, bool prefix_match, LevenshteinDfa::DfaType dfa_type)
    : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits,
                                                  (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased),
                                                  dfa_type, // TODO reorder args
                                                  (prefix_match ? LevenshteinDfa::Matching::Prefix : LevenshteinDfa::Matching::FullString))),
      _successor(),
      _prefix(extract_prefix(target, prefix_size, cased)),
      _prefix_size(prefix_size),
      _cased(cased)
{
    _successor = _prefix;
}

DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match)
    : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Table)
{
}

DfaFuzzyMatcher::~DfaFuzzyMatcher() = default;

const char*
DfaFuzzyMatcher::skip_prefix(const char* word) const
{
    Utf8ReaderForZTS reader(word);
    size_t pos = 0;
    for (; pos < _prefix.size() && reader.hasMore(); ++pos) {
        (void) reader.getChar();
    }
    assert(pos == _prefix.size());
    return reader.get_current_ptr();
}

bool
DfaFuzzyMatcher::is_match(std::string_view word) const
{
    if (_prefix_size > 0) {
        Utf8Reader reader(word.data(), word.size());
        size_t pos = 0;
        for (; pos < _prefix.size() && reader.hasMore(); ++pos) {
            uint32_t code_point = reader.getChar();
            if (!_cased) {
                code_point = LowerCase::convert(code_point);
            }
            if (code_point != _prefix[pos]) {
                break;
            }
        }
        if (!reader.hasMore() && pos == _prefix.size() && pos < _prefix_size) {
            return true;
        }
        if (pos != _prefix_size) {
            return false;
        }
        word = word.substr(reader.getPos());
    }
    auto match = _dfa.match(word);
    return match.matches();
}

template <typename DictionaryConstIteratorType>
bool
DfaFuzzyMatcher::is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) {
    if (_prefix_size > 0) {
        word = skip_prefix(word);
        if (_prefix.size() < _prefix_size) {
            if (*word == '\0') {
                return true;
            }
            _successor.resize(_prefix.size());
            _successor.emplace_back(beyond_unicode);
        } else {
            _successor.resize(_prefix.size());
            auto match = _dfa.match(word, _successor);
            if (match.matches()) {
                return true;
            }
        }
    } else {
        _successor.clear();
        auto match = _dfa.match(word, _successor);
        if (match.matches()) {
            return true;
        }
    }
    DfaStringComparator cmp(data_store, _successor, _cased);
    assert(cmp.less(itr.getKey().load_acquire(), vespalib::datastore::EntryRef()));
    itr.seek(vespalib::datastore::AtomicEntryRef(), cmp);
    return false;
}

template
bool
DfaFuzzyMatcher::is_match(const char* word, EnumPostingTree::ConstIterator& itr, const DfaStringComparator::DataStoreType& data_store);

template
bool
DfaFuzzyMatcher::is_match(const char* word, EnumTree::ConstIterator& itr, const DfaStringComparator::DataStoreType& data_store);

}