searchlib/src/vespa/searchlib/query/query_normalization.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "query_normalization.h"
#include <vespa/fastlib/text/normwordfolder.h>
#include <ostream>

namespace search {

namespace {

// Maps a Normalizing value to its enumerator name as a C string literal.
// A value matching none of the enumerators handled below terminates the
// process via abort().
const char *
to_str(search::Normalizing norm) noexcept {
    const char * label = nullptr;
    // No default label on purpose: unhandled values leave label unset.
    switch (norm) {
        case search::Normalizing::NONE:
            label = "NONE";
            break;
        case search::Normalizing::LOWERCASE:
            label = "LOWERCASE";
            break;
        case search::Normalizing::LOWERCASE_AND_FOLD:
            label = "LOWERCASE_AND_FOLD";
            break;
    }
    if (label == nullptr) {
        abort();
    }
    return label;
}

// Decides which normalization is actually applied to a term of the given type.
//  - A requested NONE or LOWERCASE is returned unchanged, whatever the type.
//  - Otherwise: EXACTSTRINGTERM yields LOWERCASE; WORD, SUBSTRINGTERM,
//    PREFIXTERM and SUFFIXTERM yield LOWERCASE_AND_FOLD; every other term
//    type yields NONE.
Normalizing
requireFold(TermType type, Normalizing normalizing) {
    if ((normalizing == Normalizing::NONE) || (normalizing == Normalizing::LOWERCASE)) {
        return normalizing;
    }
    switch (type) {
        case TermType::EXACTSTRINGTERM:
            return Normalizing::LOWERCASE;
        case TermType::WORD:
        case TermType::SUBSTRINGTERM:
        case TermType::PREFIXTERM:
        case TermType::SUFFIXTERM:
            return Normalizing::LOWERCASE_AND_FOLD;
        default:
            return Normalizing::NONE;
    }
}

// Builds a lowercased and folded copy of s, processing it byte by byte.
//  - A byte below 0x80 is appended after lowercase_and_fold_ascii().
//  - Any other byte starts a sequence decoded by GetUTF8CharNonAscii(), which
//    receives the read cursor itself (it is expected to advance it past the
//    sequence). If ReplacementString() returns a string for the decoded value,
//    that string is appended; otherwise the value is passed through
//    lowercase_and_fold() and appended re-encoded via utf8cput().
vespalib::string
fold(vespalib::stringref s) {
    const unsigned char * pos = reinterpret_cast<const unsigned char *>(s.data());
    const unsigned char * const limit = pos + s.size();
    vespalib::string result;
    while (pos < limit) {
        const unsigned char lead = *pos;
        if (lead < 0x80) {
            ++pos;
            result.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(lead));
            continue;
        }
        const uint32_t code_point = Fast_UnicodeUtil::GetUTF8CharNonAscii(pos);
        const char * replacement = Fast_NormalizeWordFolder::ReplacementString(code_point);
        if (replacement != nullptr) {
            result.append(replacement, strlen(replacement));
            continue;
        }
        const uint32_t folded_cp = Fast_NormalizeWordFolder::lowercase_and_fold(code_point);
        char encoded[6];
        const char * encoded_end = Fast_UnicodeUtil::utf8cput(encoded, folded_cp);
        result.append(encoded, encoded_end - encoded);
    }
    return result;
}

// Builds a lowercased copy of s, processing it byte by byte.
//  - A byte below 0x80 is appended after lowercase_ascii(), cast to char.
//  - Any other byte starts a sequence decoded by GetUTF8CharNonAscii(), which
//    receives the read cursor itself (it is expected to advance it past the
//    sequence); the decoded value is passed through lowercase() and appended
//    re-encoded via utf8cput().
vespalib::string
lowercase(vespalib::stringref s) {
    const unsigned char * cursor = reinterpret_cast<const unsigned char *>(s.data());
    const unsigned char * const stop = cursor + s.size();
    vespalib::string result;
    while (cursor < stop) {
        const unsigned char lead = *cursor;
        if (lead < 0x80) {
            ++cursor;
            result.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(lead)));
            continue;
        }
        const uint32_t lowered = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(cursor));
        char encoded[6];
        const char * encoded_end = Fast_UnicodeUtil::utf8cput(encoded, lowered);
        result.append(encoded, encoded_end - encoded);
    }
    return result;
}

}

// Streams the textual name of n (as produced by to_str) to os and returns os.
std::ostream &
operator<<(std::ostream &os, Normalizing n) {
    return os << to_str(n);
}

// Normalizes s according to the mode requireFold() selects for the given
// term type and requested normalization: lowercase() for LOWERCASE, fold()
// for LOWERCASE_AND_FOLD, and an unmodified copy of s in every other case.
vespalib::string
QueryNormalization::optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing) {
    const Normalizing effective = requireFold(type, normalizing);
    if (effective == Normalizing::LOWERCASE) {
        return lowercase(s);
    }
    if (effective == Normalizing::LOWERCASE_AND_FOLD) {
        return fold(s);
    }
    return s;
}

}