aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/fef/termfieldmatchdata.h
blob: 7eeb2833bcc77e8432fae7a930bee9ef27c31d80 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include "fieldpositionsiterator.h"
#include "fieldinfo.h"
#include <vespa/searchlib/common/feature.h>
#include <cstring>
#include <limits>


namespace search::fef {

class TermMatchDataMerger;

/**
 * Match information for a single term within a single field.
 *
 * The payload is stored in the Features union and is interpreted
 * according to the bits in _flags:
 *  - RAW_SCORE_FLAG set: the union holds a raw score.
 *  - MULTIPOS_FLAG set: the union holds a Positions record referring to
 *    a position array (set up by allocateVector(), which is defined
 *    outside this header).
 *  - neither set: the union holds one inline position when size() == 1,
 *    or a subqueries value when size() == 0.
 **/
class TermFieldMatchData
{
public:
    using PositionsIterator = const TermFieldMatchDataPosition *;
    using MutablePositionsIterator = TermFieldMatchDataPosition *;

    /**
     * Bookkeeping used while the multi-position representation is active.
     **/
    struct Positions {
        // First element of the position array; begin() returns this pointer.
        TermFieldMatchDataPosition *_positions;
        // Value reported by getMaxElementLength(); cleared by reset().
        uint16_t                    _maxElementLength;
        // Value reported by capacity().
        uint16_t                    _allocated;
    };

    /**
     * Overlapping storage for the different payload kinds. Which member is
     * meaningful is decided by the flag bits and the size, see the class
     * comment.
     **/
    union Features {
        feature_t     _rawScore;
        // Raw storage for one TermFieldMatchDataPosition built with placement new.
        unsigned char _position[sizeof(TermFieldMatchDataPosition)];
        Positions     _positions;
        uint64_t      _subqueries;
    };
private:
    // Flag queries.
    bool  isRawScore()  const noexcept { return _flags & RAW_SCORE_FLAG; }
    bool  isMultiPos()  const noexcept { return _flags & MULTIPOS_FLAG; }
    // True when no positions are stored.
    bool  empty() const noexcept { return _sz == 0; }
    // Forgets stored positions without touching the payload bytes.
    void  clear() noexcept { _sz = 0; }
    // True when the position array representation is in use.
    bool  allocated() const noexcept { return isMultiPos(); }
    // Views the union as the single inline position.
    const TermFieldMatchDataPosition * getFixed() const noexcept { return reinterpret_cast<const TermFieldMatchDataPosition *>(_data._position); }
    TermFieldMatchDataPosition * getFixed() noexcept { return reinterpret_cast<TermFieldMatchDataPosition *>(_data._position); }
    // Start of the position array; only meaningful when allocated().
    const TermFieldMatchDataPosition * getMultiple() const noexcept { return _data._positions._positions; }
    TermFieldMatchDataPosition * getMultiple() noexcept { return _data._positions._positions; }
    // Element weight of the first stored position, or 1 when empty.
    int32_t  getElementWeight() const noexcept { return empty() ? 1 : allocated() ? getMultiple()->getElementWeight() : getFixed()->getElementWeight(); }
    // 0 when empty; otherwise the tracked maximum (array representation) or
    // the element length of the single inline position.
    uint32_t getMaxElementLength() const noexcept { return empty() ? 0 : allocated() ? _data._positions._maxElementLength : getFixed()->getElementLen(); }
    // The following three are defined outside this header.
    void appendPositionToAllocatedVector(const TermFieldMatchDataPosition &pos);
    void allocateVector();
    void resizePositionVector(size_t sz) __attribute__((noinline));

    // Stored in _fieldId while no field id has been set; getFieldId()
    // translates it to IllegalFieldId.
    static constexpr uint16_t ILLEGAL_FIELD_ID = std::numeric_limits<uint16_t>::max();
    // Bit values for _flags.
    static constexpr uint16_t RAW_SCORE_FLAG = 1;
    static constexpr uint16_t MULTIPOS_FLAG = 2;
    static constexpr uint16_t UNPACK_NORMAL_FEATURES_FLAG = 4;
    static constexpr uint16_t UNPACK_INTERLEAVED_FEATURES_FLAG = 8;
    static constexpr uint16_t UNPACK_ALL_FEATURES_MASK = UNPACK_NORMAL_FEATURES_FLAG | UNPACK_INTERLEAVED_FEATURES_FLAG;

    uint32_t  _docId;    // document the stored data is valid for
    uint16_t  _fieldId;  // field id, or ILLEGAL_FIELD_ID
    uint16_t  _flags;    // combination of the *_FLAG bits above
    uint16_t  _sz;       // number of stored positions

    // Number of occurrences and field length used when unpacking interleaved features.
    // This can exist in addition to full position features.
    uint16_t _numOccs;
    uint16_t _fieldLength;

    Features  _data;
public:
    /** Iterator to the first stored position. */
    PositionsIterator begin() const noexcept { return allocated() ? getMultiple() : getFixed(); }
    /** Iterator one past the last stored position; equals begin() when empty. */
    PositionsIterator end() const noexcept { return allocated() ? getMultiple() + _sz : empty() ? getFixed() : getFixed()+1; }
    /** Number of stored positions. */
    size_t size() const noexcept { return _sz; }
    /** Number of positions that fit without growing; 1 for the inline representation. */
    size_t capacity() const noexcept { return allocated() ? _data._positions._allocated : 1; }

    /**
     * Make sure at least sz positions can be stored.
     *
     * @param sz wanted capacity
     **/
    void reservePositions(size_t sz) {
        if (sz > capacity()) {
            if (!allocated()) {
                allocateVector();
                // The freshly allocated vector may already be large enough.
                if (sz <= capacity()) return;
            }
            resizePositionVector(sz);
        }
    }

    /**
     * Create empty object. To complete object setup, field id must be
     * set.
     **/
    TermFieldMatchData();
    TermFieldMatchData(const TermFieldMatchData & rhs);
    ~TermFieldMatchData();
    TermFieldMatchData & operator = (const TermFieldMatchData & rhs);

    /**
     * Swaps the content of this object with the content of the given
     * term field match data object.
     *
     * @param rhs The object to swap with.
     **/
    void swap(TermFieldMatchData &rhs);

    MutablePositionsIterator populate_fixed();

    /**
     * Set which field this object has match information for.
     *
     * @return this object (for chaining)
     * @param fieldId field id
     **/
    TermFieldMatchData &setFieldId(uint32_t fieldId);

    /**
     * Obtain the field id
     *
     * @return field id, or IllegalFieldId when no field id has been set
     **/
    uint32_t getFieldId() const noexcept {
        return __builtin_expect(_fieldId != ILLEGAL_FIELD_ID, true) ? _fieldId : IllegalFieldId;
    }

    /**
     * Reset the content of this match data and prepare it for use
     * with the given docid.
     *
     * @return this object (for chaining)
     * @param docId id of the document we are generating match information for
     **/
    TermFieldMatchData &reset(uint32_t docId) noexcept {
        _docId = docId;
        _sz = 0;
        _numOccs = 0;
        _fieldLength = 0;
        if (isRawScore()) {
            _data._rawScore = 0.0;
        } else if (isMultiPos()) {
            _data._positions._maxElementLength = 0;
        } else {
            // Inline representation: getSubqueries() reads these bytes once
            // the object is empty, so wipe whatever an earlier inline
            // position or setSubqueries() call left behind.
            _data._subqueries = 0;
        }
        return *this;
    }

    /**
     * Reset only the docid of this match data and prepare it for use
     * with the given docid. All other content is left untouched.
     *
     * @return this object (for chaining)
     * @param docId id of the document we are generating match information for
     **/
    TermFieldMatchData &resetOnlyDocId(uint32_t docId) noexcept {
        _docId = docId;
        return *this;
    }

    /**
     * Indicate a match for a given docid and inject a raw score
     * instead of detailed match data. The raw score can be picked up
     * in the ranking framework by using the rawScore feature for the
     * appropriate field.
     *
     * @return this object (for chaining)
     * @param docId id of the document we have matched
     * @param score a raw score for the matched document
     **/
    TermFieldMatchData &setRawScore(uint32_t docId, feature_t score) noexcept {
        resetOnlyDocId(docId);
        enableRawScore();
        _data._rawScore = score;
        return *this;
    }

    /**
     * Mark this object as carrying a raw score. The stored score value
     * itself is not modified.
     *
     * @return this object (for chaining)
     **/
    TermFieldMatchData & enableRawScore() noexcept {
        _flags |= RAW_SCORE_FLAG;
        return *this;
    }

    /**
     * Obtain the raw score for this match data.
     *
     * @return raw score, or 0.0 when raw score is not enabled
     **/
    feature_t getRawScore() const noexcept {
        return __builtin_expect(isRawScore(), true) ? _data._rawScore : 0.0;
    }

    /**
     * Store a subqueries value for the given docid. Only the docid and the
     * union payload are written; size and flags are left as they are.
     *
     * @param docId id of the document we have matched
     * @param subqueries value to store
     **/
    void setSubqueries(uint32_t docId, uint64_t subqueries) noexcept {
        resetOnlyDocId(docId);
        _data._subqueries = subqueries;
    }

    /**
     * Obtain the stored subqueries value.
     *
     * @return 0 when positions are stored or raw score is enabled,
     *         otherwise the stored value
     **/
    uint64_t getSubqueries() const noexcept {
        // NOTE(review): when the position array representation is active and
        // empty, the union bytes read here overlap the Positions record.
        // Confirm that callers never combine subqueries with positions.
        if (!empty() || isRawScore()) {
            return 0;
        }
        return _data._subqueries;
    }

    /**
     * Obtain the document id for which the data contained in this object is valid.
     *
     * @return document id
     **/
    uint32_t getDocId() const noexcept {
        return _docId;
    }

    /**
     * Obtain the weight of the first occurrence in this field, or 1
     * if no occurrences are present. This function is intended for
     * attribute matching calculations.
     *
     * @return weight
     **/
    int32_t getWeight() const noexcept {
        if (__builtin_expect(_sz == 0, false)) {
            return 1;
        }
        return __builtin_expect(allocated(), false) ? getMultiple()->getElementWeight() : getFixed()->getElementWeight();
    }

    /**
     * Add occurrence information to this match data for the current
     * document.
     *
     * @return this object (for chaining)
     * @param pos low-level occurrence information
     **/
    TermFieldMatchData &appendPosition(const TermFieldMatchDataPosition &pos) {
        if (_sz == 0 && !allocated()) {
            // First position and no array yet: construct it inline in the union.
            _sz = 1;
            new (_data._position) TermFieldMatchDataPosition(pos);
        } else {
            if (!allocated()) {
                allocateVector();
            }
            appendPositionToAllocatedVector(pos);
        }
        return *this;
    }

    /**
     * Obtain an object that gives access to the low-level occurrence
     * information stored in this object.
     *
     * @return field position iterator
     **/
    FieldPositionsIterator getIterator() const {
        const uint32_t len(getMaxElementLength());
        // A length of 0 is passed on as "unknown".
        return FieldPositionsIterator(len != 0 ? len : FieldPositionsIterator::UNKNOWN_LENGTH, begin(), end());
    }

    uint16_t getNumOccs() const noexcept { return _numOccs; }
    uint16_t getFieldLength() const noexcept { return _fieldLength; }

    void setNumOccs(uint16_t value) noexcept { _numOccs = value; }
    void setFieldLength(uint16_t value) noexcept { _fieldLength = value; }

    /**
     * This indicates if this instance is actually used for ranking or not.
     * @return true if it is not needed.
     */
    bool isNotNeeded() const noexcept {
        return ((_flags & UNPACK_ALL_FEATURES_MASK) == 0u);
    }

    bool needs_normal_features() const noexcept { return ((_flags & UNPACK_NORMAL_FEATURES_FLAG) != 0u); }

    bool needs_interleaved_features() const noexcept{ return ((_flags & UNPACK_INTERLEAVED_FEATURES_FLAG) != 0u); }

    /**
     * Tag that this instance is not really used for ranking.
     */
    void tagAsNotNeeded() noexcept {
        _flags &= ~UNPACK_ALL_FEATURES_MASK;
    }

    /**
     * Tag whether this instance is used for ranking (normal features)
     *
     * @param needed true to set the flag, false to clear it
     */
    void setNeedNormalFeatures(bool needed) noexcept {
        if (needed) {
            _flags |= UNPACK_NORMAL_FEATURES_FLAG;
        } else {
            _flags &= ~UNPACK_NORMAL_FEATURES_FLAG;
        }
    }

    /**
     * Tag whether this instance is used for ranking (interleaved features)
     *
     * @param needed true to set the flag, false to clear it
     */
    void setNeedInterleavedFeatures(bool needed) noexcept {
        if (needed) {
            _flags |= UNPACK_INTERLEAVED_FEATURES_FLAG;
        } else {
            _flags &= ~UNPACK_INTERLEAVED_FEATURES_FLAG;
        }
    }

    /**
     * Special docId value indicating that no data has been saved yet.
     * This should match (or be above) endId() in search::queryeval::SearchIterator.
     *
     * @return constant
     **/
    static uint32_t invalidId() noexcept { return 0xdeadbeefU; }
};

}