blob: f680d9b6c47fbdfd65fa6ca39224230c488ada58 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
#include <vespa/searchlib/query/base.h>
#include <vespa/fastlib/text/normwordfolder.h>
namespace vsm {
/**
* Handles tokenization of utf8 input with on the fly normalization.
* It handles Normalizing::NONE, Normalizing::LOWERCASE, and Normalizing::LOWERCASE_AND_FOLD
*/
class TokenizeReader {
public:
    using byte = search::byte;
    using Normalizing = search::Normalizing;
    /**
     * Set up a reader that decodes utf8 from [p, p + len) and writes
     * ucs4 code points into the caller-supplied buffer q.
     * NOTE(review): q must have room for the decoded token plus a
     * terminating zero (see complete()) — caller's responsibility.
     */
    TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
        : _p(p),
          _p_end(p + len),
          _q(q),
          _q_start(q)
    {}
    /// Decode one utf8 character and advance the read cursor past it.
    ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
    /// Append c to the output buffer, normalized according to normalize_mode.
    void normalize(ucs4_t c, Normalizing normalize_mode) {
        switch (normalize_mode) {
        case Normalizing::LOWERCASE_AND_FOLD:
            fold(c); // lowercases, folds accents, may emit several code points
            break;
        case Normalizing::LOWERCASE:
            *_q++ = Fast_NormalizeWordFolder::lowercase(c);
            break;
        case Normalizing::NONE:
            *_q++ = c;
            break;
        }
    }
    /// True while unread input bytes remain.
    bool hasNext() const noexcept { return _p < _p_end; }
    /// Current read position in the utf8 input.
    const byte * p() const noexcept { return _p; }
    /**
     * Zero-terminate the token written so far, rewind the write cursor
     * to the start of the buffer, and return the token length in
     * ucs4 code points (excluding the terminator).
     */
    size_t complete() noexcept {
        *_q = 0;
        size_t written = _q - _q_start;
        _q = _q_start;
        return written;
    }
    // Shared implementation for the two tokenize flavors; defined out-of-line.
    template <bool exact_match>
    size_t tokenize_helper(Normalizing norm_mode);
    size_t tokenize(Normalizing norm_mode) { return tokenize_helper<false>(norm_mode); }
    size_t tokenize_exact_match(Normalizing norm_mode) { return tokenize_helper<true>(norm_mode); }
private:
    // Lowercase-and-fold c into the output buffer; defined out-of-line.
    void fold(ucs4_t c);
    const byte *_p;       // read cursor into the utf8 input
    const byte *_p_end;   // one past the last input byte
    ucs4_t     *_q;       // write cursor into the ucs4 output buffer
    ucs4_t     *_q_start; // start of the output buffer
};
}
|