blob: f680d9b6c47fbdfd65fa6ca39224230c488ada58 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
#include <vespa/searchlib/query/base.h>
#include <vespa/fastlib/text/normwordfolder.h>
namespace vsm {
/**
* Handles tokenization of utf8 input with on the fly normalization.
* It handles Normalizing::NONE, Normalizing::LOWERCASE, and Normalizing::LOWERCASE_AND_FOLD
*/
class TokenizeReader {
public:
    using byte = search::byte;
    using Normalizing = search::Normalizing;
    /**
     * Set up a reader that decodes utf8 from [p, p + len) and writes
     * ucs4 code points into the caller-supplied buffer q.
     * NOTE(review): q must have room for the decoded token plus a
     * terminating zero (see complete()) — caller's responsibility.
     */
    TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
        : _p(p),
          _p_end(p + len),
          _q(q),
          _q_start(q)
    {}
    /// Decode one utf8 character and advance the read cursor past it.
    ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
    /// Append c to the output buffer, normalized according to normalize_mode.
    void normalize(ucs4_t c, Normalizing normalize_mode) {
        switch (normalize_mode) {
        case Normalizing::LOWERCASE_AND_FOLD:
            fold(c); // lowercases, folds accents, may emit several code points
            break;
        case Normalizing::LOWERCASE:
            *_q++ = Fast_NormalizeWordFolder::lowercase(c);
            break;
        case Normalizing::NONE:
            *_q++ = c;
            break;
        }
    }
    /// True while unread input bytes remain.
    bool hasNext() const noexcept { return _p < _p_end; }
    /// Current read position in the utf8 input.
    const byte * p() const noexcept { return _p; }
    /**
     * Zero-terminate the token written so far, rewind the write cursor
     * to the start of the buffer, and return the token length in
     * ucs4 code points (excluding the terminator).
     */
    size_t complete() noexcept {
        *_q = 0;
        size_t written = _q - _q_start;
        _q = _q_start;
        return written;
    }
    // Shared implementation for the two tokenize flavors; defined out-of-line.
    template <bool exact_match>
    size_t tokenize_helper(Normalizing norm_mode);
    size_t tokenize(Normalizing norm_mode) { return tokenize_helper<false>(norm_mode); }
    size_t tokenize_exact_match(Normalizing norm_mode) { return tokenize_helper<true>(norm_mode); }
private:
    // Lowercase-and-fold c into the output buffer; defined out-of-line.
    void fold(ucs4_t c);
    const byte *_p;       // read cursor into the utf8 input
    const byte *_p_end;   // one past the last input byte
    ucs4_t     *_q;       // write cursor into the ucs4 output buffer
    ucs4_t     *_q_start; // start of the output buffer
};
}
|