// searchsummary/src/vespa/juniper/tokenizer.cpp

// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
//
#include "tokenizer.h"
#include "juniperdebug.h"
#include <vespa/fastlib/text/wordfolder.h>
#include <cinttypes>

#include <vespa/log/log.h>
LOG_SETUP(".juniper.tokenizer");

/**
 * Set up a tokenizer over the input range [text, text + len).
 *
 * Every argument is stored as given; nothing is copied or validated here.
 * The character and word position counters start at zero.
 *
 * @param wordfolder  word folder that scan() uses for UCS4 tokenization
 * @param text        start of the input to tokenize
 * @param len         length of the input, counted in chars
 * @param successor   receiver of the tokens and the end notification from scan()
 * @param registry    special token registry; scan() bypasses it when this is null
 */
JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder,
                                   const char* text, size_t len, ITokenProcessor* successor,
                                   const juniper::SpecialTokenRegistry * registry)
    : _wordfolder(wordfolder),
      _text(text),
      _len(len),
      _successor(successor),
      _registry(registry),
      _charpos(0),
      _wordpos(0)
{
}


/**
 * Point the tokenizer at a new input range and restart position counting.
 *
 * @param text  start of the new input
 * @param len   length of the new input, counted in chars
 */
void JuniperTokenizer::SetText(const char* text, size_t len)
{
    // Positions are relative to the current text, so both counters restart.
    _wordpos = 0;
    _charpos = 0;

    _len = len;
    _text = text;
}


// Scan the input and dispatch to the successor
void JuniperTokenizer::scan()
{
    ITokenProcessor::Token token;

    const char* src = _text;
    const char* src_end = _text + _len;
    const char* startpos = NULL;
    ucs4_t* dst = _buffer;
    ucs4_t* dst_end = dst + TOKEN_DSTLEN;
    size_t result_len;

    while (src < src_end)
    {
        if (_registry == NULL) {
            // explicit prefetching seems to have negative effect with many threads
            src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
        } else {
            const char * tmpSrc = _registry->tokenize(src, src_end, dst, dst_end, startpos, result_len);
            if (tmpSrc == NULL) {
                src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
            } else {
                src = tmpSrc;
            }
        }
        if (dst[0] == 0) break;
        token.curlen = result_len;
        token.token = dst;
        token.wordpos = _wordpos++;
        token.bytepos = startpos - _text;
        token.bytelen = src - startpos;
        LOG(debug, "curlen %d, bytepos %" PRId64 ", bytelen %d",
            token.curlen, static_cast<int64_t>(token.bytepos), token.bytelen);
        // NB! not setting charlen/charpos/_utf8pos/_utf8len yet...!
        _successor->handle_token(token);
    }
    token.bytepos = _len;
    token.bytelen = 0;
    token.token = NULL;
    _successor->handle_end(token);
}