blob: 64e1e0ed4968006da7efa83dd1b6073e22946f42 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "query_normalization.h"
#include <vespa/fastlib/text/normwordfolder.h>
#include <ostream>
namespace search {
namespace {
// Returns the enumerator name of 'norm' as a string literal.
// Calls abort() when 'norm' matches none of the enumerators handled below.
const char *
to_str(search::Normalizing norm) noexcept {
    const char *name = nullptr;
    switch (norm) {
    case search::Normalizing::NONE:
        name = "NONE";
        break;
    case search::Normalizing::LOWERCASE:
        name = "LOWERCASE";
        break;
    case search::Normalizing::LOWERCASE_AND_FOLD:
        name = "LOWERCASE_AND_FOLD";
        break;
    }
    if (name == nullptr) {
        // Value outside the handled enumerators.
        abort();
    }
    return name;
}
// Picks the normalization that is actually applied to a term of the given type.
//  - A requested NONE or LOWERCASE is returned unchanged, whatever the term type.
//  - Otherwise EXACTSTRINGTERM is limited to LOWERCASE,
//    WORD / SUBSTRINGTERM / PREFIXTERM / SUFFIXTERM get LOWERCASE_AND_FOLD,
//    and every other term type gets NONE.
Normalizing
requireFold(TermType type, Normalizing normalizing) {
    if ((normalizing == Normalizing::NONE) || (normalizing == Normalizing::LOWERCASE)) {
        return normalizing;
    }
    if (type == TermType::EXACTSTRINGTERM) {
        return Normalizing::LOWERCASE;
    }
    const bool foldable_type = (type == TermType::WORD)
                            || (type == TermType::SUBSTRINGTERM)
                            || (type == TermType::PREFIXTERM)
                            || (type == TermType::SUFFIXTERM);
    if (foldable_type) {
        return Normalizing::LOWERCASE_AND_FOLD;
    }
    return Normalizing::NONE;
}
// Builds a lowercased and folded copy of 's', processing it byte by byte.
//  - Bytes below 0x80 are mapped through lowercase_and_fold_ascii().
//  - Anything else is decoded with GetUTF8CharNonAscii(); if ReplacementString()
//    yields a non-null string for the decoded value, that string is appended,
//    otherwise the value is mapped through lowercase_and_fold() and re-encoded
//    with utf8cput().
vespalib::string
fold(vespalib::stringref s) {
    const auto * cursor = reinterpret_cast<const unsigned char *>(s.data());
    const unsigned char * const limit = cursor + s.size();
    vespalib::string result;
    while (cursor < limit) {
        if (*cursor < 0x80) {
            result.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*cursor++));
            continue;
        }
        // NOTE(review): GetUTF8CharNonAscii is expected to advance 'cursor' past the
        // decoded sequence; nothing else moves it on this path.
        const uint32_t decoded = Fast_UnicodeUtil::GetUTF8CharNonAscii(cursor);
        const char * const replacement = Fast_NormalizeWordFolder::ReplacementString(decoded);
        if (replacement == nullptr) {
            const uint32_t folded_char = Fast_NormalizeWordFolder::lowercase_and_fold(decoded);
            char utf8[6];
            const char * utf8_end = Fast_UnicodeUtil::utf8cput(utf8, folded_char);
            result.append(utf8, utf8_end - utf8);
        } else {
            size_t replacement_len = strlen(replacement);
            result.append(replacement, replacement_len);
        }
    }
    return result;
}
// Builds a lowercased copy of 's', processing it byte by byte.
//  - Bytes below 0x80 are mapped through lowercase_ascii().
//  - Anything else is decoded with GetUTF8CharNonAscii(), mapped through
//    lowercase() and re-encoded with utf8cput().
vespalib::string
lowercase(vespalib::stringref s) {
    const auto * cursor = reinterpret_cast<const unsigned char *>(s.data());
    const unsigned char * const limit = cursor + s.size();
    vespalib::string result;
    while (cursor < limit) {
        if (*cursor < 0x80) {
            result.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*cursor++)));
            continue;
        }
        // NOTE(review): GetUTF8CharNonAscii is expected to advance 'cursor' past the
        // decoded sequence; nothing else moves it on this path.
        const uint32_t lowered = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(cursor));
        char utf8[6];
        const char * utf8_end = Fast_UnicodeUtil::utf8cput(utf8, lowered);
        result.append(utf8, utf8_end - utf8);
    }
    return result;
}
}
// Writes the name of 'n', as produced by to_str(), to 'os' and returns the stream.
std::ostream &
operator<<(std::ostream &os, Normalizing n) {
    return os << to_str(n);
}
// Normalizes 's' according to what requireFold() selects for the given term type
// and requested normalization: lowercase(), fold(), or an unmodified copy of 's'.
vespalib::string
QueryNormalization::optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing) {
    switch (requireFold(type, normalizing)) {
    case Normalizing::LOWERCASE:
        return lowercase(s);
    case Normalizing::LOWERCASE_AND_FOLD:
        return fold(s);
    case Normalizing::NONE:
        break;
    }
    // NONE, or any value not matched above: hand the input back untouched.
    return s;
}
}
|