aboutsummaryrefslogtreecommitdiffstats
path: root/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
blob: 2c17283d20a17d32f284622d4211817a49fa5a54 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include <vespa/vespalib/stllike/string.h>
#include <vector>
#include <span>

namespace vespalib {

/**
 * Fuzzy matching between lowercased instances of query and document terms based on Levenshtein distance.
 * Class has two main parameters:
 * - prefix size, i.e the size of the prefix that is considered frozen.
 * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms
 * Prefix size dictates of how match of a prefix is frozen,
 * i.e. if prefixes between the document and the query do not match (after lowercase)
 * matcher would return false early, without fuzzy match.
 */
class FuzzyMatcher {
private:
    static constexpr uint32_t DefaultPrefixSize = 0u;
    static constexpr uint32_t DefaultMaxEditDistance = 2u;

    uint32_t _max_edit_distance; // max edit distance
    uint32_t _prefix_size;       // prefix of a term that is considered frozen, i.e. non-fuzzy
    bool     _is_cased;
    bool     _is_prefix;

    std::vector<uint32_t> _folded_term_codepoints;

    std::span<const uint32_t> _folded_term_codepoints_prefix;
    std::span<const uint32_t> _folded_term_codepoints_suffix;

public:
    FuzzyMatcher();
    FuzzyMatcher(const FuzzyMatcher &) = delete;
    FuzzyMatcher & operator = (const FuzzyMatcher &) = delete;
    FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased, bool is_prefix);
    ~FuzzyMatcher();

    [[nodiscard]] bool isMatch(std::string_view target) const;
    [[nodiscard]] vespalib::string getPrefix() const;

    static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
    static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
};

}