aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
blob: c217a7b8866d8164502e9716d04aeaf0ce0f1c15 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include "strchrfieldsearcher.h"

namespace vsm {

/**
 * This class is the base class for all utf8 string searchers.
 * It contains utility functions used by the other searchers.
 * As usual, the prepare method is called after the query is built, and a
 * SharedSearcherBuf is given to it. This is a buffer that is shared among all
 * searchers that are run in the same context.
 * Reuse of this buffer ensures better cache hit ratio because this is just a
 * scratchpad for tokenizing. It will grow till the max size and stay there.
 **/
class UTF8StringFieldSearcherBase : public StrChrFieldSearcher
{
public:
    /**
     * Class that wraps an ucs4 buffer and appends characters to it.
     * Used when invoking skipSeparators() during substring matching.
     *
     * The wrapper does not own the buffer and performs no bounds checking;
     * the caller must supply a buffer that is large enough for everything
     * that is written through the wrapper.
     *
     * The member functions are non-virtual. OffsetWrapper redeclares some of
     * them, so the version that runs is selected by the static type of the
     * object (e.g. the template argument of skipSeparators()).
     **/
    class BufferWrapper
    {
    protected:
        ucs4_t * _bbuf; // start of the wrapped buffer
        ucs4_t * _cbuf; // current write position inside the wrapped buffer

    public:
        /** Wraps the given buffer; writing starts at its first element. */
        explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { }
        /**
         * Same as the single argument constructor; the second argument is ignored.
         * NOTE(review): presumably present so that generic code can construct a
         * BufferWrapper and an OffsetWrapper with the same argument list - confirm.
         */
        BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { }
        /** Appends the character and advances the write position. The offset argument is ignored. */
        void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; }
        /** No-op; this wrapper does not track offsets. */
        void onOffset(size_t) { }
        /** Advances the write position by inc elements without writing anything. */
        void incBuf(size_t inc) { _cbuf += inc; }
        /** Returns the current write position. */
        ucs4_t * getBuf() { return _cbuf; }
        /** Always true; there is no second buffer that could get out of sync. */
        bool valid() const noexcept { return true; }
        /** Returns the number of elements between the start of the buffer and the write position. */
        size_t size() const noexcept { return static_cast<size_t>(_cbuf - _bbuf); }
        /** Always false; this wrapper does not record offsets. */
        bool hasOffsets() const noexcept { return false; }
    };

    /**
     * Class that wraps an offset buffer in addition to an ucs4 buffer.
     * The offset buffer contains offsets into the original utf8 buffer.
     *
     * As with BufferWrapper, neither buffer is owned and no bounds checking
     * is performed.
     **/
    class OffsetWrapper : public BufferWrapper
    {
    private:
        size_t * _boff; // start of the offset buffer
        size_t * _coff; // current write position inside the offset buffer

    public:
        /** Wraps the given character buffer and offset buffer; writing starts at the first element of each. */
        explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
        /** Appends the character and its offset, advancing both write positions. */
        void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
        /** Appends an offset only; the character buffer is left untouched. */
        void onOffset(size_t of) { *_coff++ = of; }
        /** Returns true if the number of elements written to the two buffers is the same. */
        bool valid() const noexcept { return (size() == static_cast<size_t>(_coff - _boff)); }
        /** Always true; this wrapper records offsets. */
        bool hasOffsets() const noexcept { return true; }
    };

protected:
    // NOTE(review): presumably the shared scratch buffer handed to prepare() - confirm in the implementation.
    SharedSearcherBuf _buf;

    /**
     * Matches the given query term against the words in the given field reference
     * using exact or prefix match strategy.
     *
     * @param f  the field reference to match against.
     * @param qt the query term trying to match.
     * @return   the number of words in the field ref.
     **/
    size_t matchTermRegular(const FieldRef & f, search::streaming::QueryTerm & qt);

    /**
     * Matches the given query term against the characters in the given field reference
     * using substring match strategy.
     *
     * @param f  the field reference to match against.
     * @param qt the query term trying to match.
     * @return   the number of words in the field ref.
     **/
    size_t matchTermSubstring(const FieldRef & f, search::streaming::QueryTerm & qt);

    /**
     * Matches the given query term against the words in the given field reference
     * using suffix match strategy.
     *
     * @param f  the field reference to match against.
     * @param qt the query term trying to match.
     * @return   the number of words in the field ref.
     **/
    size_t matchTermSuffix(const FieldRef & f, search::streaming::QueryTerm & qt);

    /**
     * Matches the given query term against the words in the given field reference
     * using exact match strategy.
     *
     * @param f  the field reference to match against.
     * @param qt the query term trying to match.
     * @return   the number of words in the field ref.
     **/
    size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt);

public:
    explicit UTF8StringFieldSearcherBase(FieldIdT fId);
    ~UTF8StringFieldSearcherBase() override;

    /**
     * Overrides the base class prepare(). Called after the query is built and
     * given the scratch buffer that is shared among the searchers running in
     * the same context.
     **/
    void prepare(search::streaming::QueryTermList& qtl,
                 const SharedSearcherBuf& buf,
                 const vsm::FieldPathMapT& field_paths,
                 search::fef::IQueryEnvironment& query_env) override;

    /**
     * Matches the given query term against the given word using suffix match strategy.
     *
     * @param term    the buffer with the term.
     * @param termlen the length of the term.
     * @param word    the buffer with the word.
     * @param wordlen the length of the word.
     * @return true if the term matches the word.
     **/
    static bool matchTermSuffix(const cmptype_t * term, size_t termlen,
                                const cmptype_t * word, size_t wordlen);

    /**
     * Checks whether the given character is a separator character.
     *
     * @return true if the character is a separator character.
     **/
    static bool isSeparatorCharacter(ucs4_t);

    /**
     * Transforms the given utf8 array into an array of ucs4 characters.
     * Folding is performed. Separator characters are skipped.
     *
     * @param p      the utf8 input.
     * @param sz     the size of the utf8 input.
     * @param dstbuf the destination wrapper that receives the ucs4 characters,
     *               e.g. a BufferWrapper or an OffsetWrapper.
     * NOTE(review): the meaning of the return value is not visible in this
     * header; presumably the number of ucs4 characters produced - confirm
     * against the definition.
     **/
    template <typename T>
    size_t skipSeparators(const search::byte * p, size_t sz, T & dstbuf);

};

}