1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
//
#include "tokenizer.h"
#include "juniperdebug.h"
#include <vespa/fastlib/text/wordfolder.h>
#include <cinttypes>
#include <vespa/log/log.h>
LOG_SETUP(".juniper.tokenizer");
/**
 * Set up a tokenizer for the byte range [text, text + len).
 *
 * The arguments are only stored in the corresponding members here; no
 * tokenization takes place until scan() is called.
 *
 * @param wordfolder word folder that scan() uses to split the text into UCS4 tokens
 * @param text       start of the input text
 * @param len        length of the input text in bytes
 * @param successor  token processor that scan() hands tokens and the end marker to
 * @param registry   special token registry that scan() consults first; may be null,
 *                   in which case only the word folder is used
 */
JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder,
                                   const char* text,
                                   size_t len,
                                   ITokenProcessor* successor,
                                   const juniper::SpecialTokenRegistry* registry)
    : _wordfolder(wordfolder),
      _text(text),
      _len(len),
      _successor(successor),
      _registry(registry),
      _charpos(0),
      _wordpos(0)
{
}
/**
 * Replace the input text and restart position tracking.
 *
 * @param text start of the new input text
 * @param len  length of the new input text in bytes
 */
void JuniperTokenizer::SetText(const char* text, size_t len)
{
    // Position counters start over for the new input.
    _wordpos = 0;
    _charpos = 0;

    // Remember the new input range.
    _len = len;
    _text = text;
}
// Scan the input and dispatch to the successor
void JuniperTokenizer::scan()
{
ITokenProcessor::Token token;
const char* src = _text;
const char* src_end = _text + _len;
const char* startpos = NULL;
ucs4_t* dst = _buffer;
ucs4_t* dst_end = dst + TOKEN_DSTLEN;
size_t result_len;
while (src < src_end)
{
if (_registry == NULL) {
// explicit prefetching seems to have negative effect with many threads
src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
} else {
const char * tmpSrc = _registry->tokenize(src, src_end, dst, dst_end, startpos, result_len);
if (tmpSrc == NULL) {
src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
} else {
src = tmpSrc;
}
}
if (dst[0] == 0) break;
token.curlen = result_len;
token.token = dst;
token.wordpos = _wordpos++;
token.bytepos = startpos - _text;
token.bytelen = src - startpos;
LOG(debug, "curlen %d, bytepos %" PRId64 ", bytelen %d",
token.curlen, static_cast<int64_t>(token.bytepos), token.bytelen);
// NB! not setting charlen/charpos/_utf8pos/_utf8len yet...!
_successor->handle_token(token);
}
token.bytepos = _len;
token.bytelen = 0;
token.token = NULL;
_successor->handle_end(token);
}
|