aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h
blob: 61077c9e91c0342bdf6445b6a30d79ff63d26ae1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include "metrics.h"
#include "params.h"
#include "segmentstart.h"
#include "simplemetrics.h"
#include <vespa/searchlib/features/queryterm.h>
#include <vespa/searchlib/common/allocatedbitvector.h>
#include <vespa/vespalib/util/arrayref.h>

namespace search::fef {

class PhraseSplitter;
class TermFieldMatchData;

}

namespace search::features::fieldmatch {

class ComputerSharedState;

/**
 * <p>Calculates a set of metrics capturing information about the degree of agreement between a query and a field
 * string. This algorithm attempts to capture the property of text that very close tokens are usuall part of the same
 * semantic structure, while tokens farther apart are much more loosely related.  The algorithm will locate alternative
 * such regions containing multiple query tokens (segments), do a more detailed analysis of these segments and choose
 * the ones producing the best overall set of match metrics.</p>
 *
 * <p>Such segments are found by looking at query terms in sequence from left top right and finding matches in the
 * field. All alternative segment start points are explored, and the segmentation achieving the best overall string
 * match metric score is preferred. The dynamic programming paradigm is used to avoid redoing work on segmentations.</p>
 *
 * <p>When a segment start point is found, subsequenc tokens from the query are searched in the field from this starting
 * point in "semantic order". This search order can be defined independently of the algorithm. The current order
 * searches <i>proximityLimit tokens ahead first, then the same distance backwards (so if you need to go two steps
 * backwards in the field from the segment starting point, the real distance is -2, but the "semantic distance" is
 * proximityLimit+2.</p>
 *
 * <p>The actual metrics are calculated during execution of this algorithm by the {@link Metrics} class, by
 * receiving events emitted from the algorithm. Any set of metrics derivable from these events a computable using this
 * algorithm.</p>
 *
 * <p>Terminology:
 * <ul>
 * <li><b>Sequence</b> - A set of adjacent matched tokens in the field.</li>
 * <li><b>Segment</b> - A field area containing matches to a continuous section of the query.</li>
 * <li><b>Gap</b> - A chunk of adjacent tokens <i>inside a segment</i> separating two matched characters.</li>
 * <li><b>Semantic distance</b> - A non-continuous distance between tokens in j.</li>
 * </ul>
 *
 * <p>Notation: A position index in the query is denoted <code>i</code>. A position index in the field is denoted
 * <code>j</code>.</p>
 *
 * <p>This class is not multithread safe, but is reusable across queries for a single thread.</p>
 *
 * @author <a href="mailto:bratseth@yahoo-inc.com">Jon Bratseth</a>
 * @author Simon Thoresen Hult
 * @version $Id$
 */
class Computer {
public:
    /**
     * Constructs a new computer object.
     *
     * @param shared_state      The shared state for this computer
     * @param splitter          The environment that holds all query information.
     */
    Computer(const ComputerSharedState& shared_state, const fef::PhraseSplitter& splitter);

    ~Computer();

    /**
     * Resets this object according to the given document id
     *
     * @param docid The local document id to be evaluated
     */
    void reset(uint32_t docId);

    /**
     * Runs this computer using the environment, match and parameters given to the constructor.
     *
     * @return The final metrics.
     */
    const Metrics & run();

    /**
     * Returns the final metrics.
     *
     * @return The final metrics.
     */
    const Metrics & getFinalMetrics() const {
        return _finalMetrics;
    }

    /**
     * Implements the prefered search order for finding a match to a query item - first
     * looking close in the right order, then close in the reverse order, then far in the right order
     * and lastly far in the reverse order.
     *
     * @param i                     The query term index.
     * @param previousJ             The previous field index.
     * @param startSemanticDistance The semantic distance we must be larger than or equal to.
     * @return The semantic distance of the next mathing j larger than startSemanticDistance, or -1 if
     *         there are no matches larger than startSemanticDistance
     */
    int findClosestInFieldBySemanticDistance(int i, int previousJ, uint32_t startSemanticDistance);

    /**
     * Returns the field index (j) from a starting point zeroJ and the distance form zeroJ in the
     * semantic distance space.
     *
     * @param semanticDistance The semantic distance to transform to field index.
     * @param zeroJ            The starting point.
     * @returns The field index, or -1 (undefined) if the semanticDistance is -1.
     */
    int semanticDistanceToFieldIndex(int semanticDistance, uint32_t zeroJ) const;

    /**
     * Returns the semantic distance from a starting point zeroJ to a field index j.
     *
     * @param j     The field index to transform to semantic distance.
     * @param zeroJ The starting point.
     * @returns The semantic distance, or -1 (undefined) if j is -1.
     */
    int fieldIndexToSemanticDistance(int j, uint32_t zeroJ) const;

    /**
     * Returns the id of the searched field.
     *
     * @return The field id.
     */
    uint32_t getFieldId() const {
        return _fieldId;
    }

    /**
     * Returns the number of terms present in the searched field.
     *
     * @return The field length.
     */
    uint32_t getFieldLength() const {
        return _fieldLength;
    }

    /**
     * Returns the parameter object that was used to instantiate this.
     *
     * @return The parameters.
     */
    const Params &getParams() const {
        return _params;
    }

    /**
     * Returns the number of terms searching on this field.
     *
     * @return The number of terms.
     */
    uint32_t getNumQueryTerms() const {
        return _queryTerms.size();
    }

    /**
     * Returns the query term data for a specified term.
     *
     * @param  The index of the term to return.
     * @return The query term data.
     */
    const QueryTerm & getQueryTermData(int term) const {
        return _queryTerms[term];
    }

    /**
     * Returns the term match for a specified term.
     *
     * @param  The index of the term match to return.
     * @return The term match.
     */
    const fef::TermFieldMatchData *getQueryTermFieldMatch(int term) const {
        return _queryTermFieldMatch[term];
    }

    /**
     * Returns the total weight of all query terms.
     *
     * @return The total weight.
     */
    uint32_t getTotalTermWeight() const {
        return _totalTermWeight;
    }

    /**
     * Returns the total significance of all query terms.
     *
     * @return The total significance.
     */
    feature_t getTotalTermSignificance() const {
        return _totalTermSignificance;
    }

    /**
     * Returns a string representation of this computer.
     *
     * @return A string representation.
     */
    vespalib::string toString() const;

    /**
     * Returns the simple metrics computed while traversing the list of query terms in the constructor.
     *
     * @return the simple metrics object.
     */
    const SimpleMetrics & getSimpleMetrics() const {
        return _simpleMetrics;
    }


private:
    /**
     * Finds segment candidates and explores them until we have the best segmentation history of the entire query.
     */
    void exploreSegments();

    /**
     * Find correspondences from a segment starting point startI.
     *
     * @param segment The segment starting point.
     * @return True if a segment was found, false if none could be found.
     */
    bool findAlternativeSegmentFrom(SegmentStart *segment);

    /**
     * A match occured within a segment, report this to the metric as appropriate.
     *
     * @param i         The current query term index.
     * @param j         The current field term index.
     * @param previousJ The previous field term index.
     * @param previousI The previous query term index.
     */
    void inSegment(int i, int j, int previousJ, int previousI);

    /**
     * Returns whether this segment was accepted as a starting point.
     *
     * @param i         The current query term index.
     * @param j         The current field term index.
     * @param previousJ The previous field term index.
     * @return Whether this segment was accepted or not.
     */
    bool segmentStart(int i, int j, int previousJ);

    /**
     * Registers an end of a segment.
     *
     * @param i The i at which this segment ends.
     * @param j The j at which this segment ends.
     */
    void segmentEnd(int i, int j);

    /**
     * Returns the next open segment to explore, or null if no more segments exists or should be explored.
     *
     * @param The i to start searching from.
     * @return The next open segment, or null.
     */
    SegmentStart *findOpenSegment(uint32_t startI);

    /**
     * Returns the last segment start point in the internal list.
     *
     * @return The last segment start.
     */
    SegmentStart *findLastStartPoint();

    /**
     * Counts all occurrences of terms of the query in the field and set those metrics.
     *
     * @param metrics The metrics to update.
     */
    void setOccurrenceCounts(Metrics &metrics);

    void handleError(uint32_t fieldPos, uint32_t docId) const __attribute__((noinline));


private:
    using BitVectorPtr = std::shared_ptr<BitVector>;
    using TermFieldMatchDataVector = std::vector<const fef::TermFieldMatchData *>;

    struct SegmentData {
        SegmentData() : segment(), valid(false) {}
        SegmentData(SegmentStart::SP ss, bool v = false) noexcept : segment(std::move(ss)), valid(v) {}
        SegmentStart::SP segment;
        bool valid;
    };

    struct BitVectorData {
        BitVectorData() : bitvector(0), valid(false) {}
        AllocatedBitVector bitvector;
        bool valid;
    };

    // per query
    const ComputerSharedState&                 _shared_state;
    const search::fef::PhraseSplitter        & _splitter;
    uint32_t                                   _fieldId;
    const Params                               _params;
    bool                                       _useCachedHits;

    const vespalib::ConstArrayRef<QueryTerm>   _queryTerms;
    TermFieldMatchDataVector                   _queryTermFieldMatch;
    uint32_t                                   _totalTermWeight;
    feature_t                                  _totalTermSignificance;

    // per docid
    uint32_t                                   _fieldLength;
    Metrics                                    _currentMetrics; // The metrics of the currently explored segmentation.
    Metrics                                    _finalMetrics;   // The final metrics, null during and before metric computation.
    SimpleMetrics                              _simpleMetrics;  // The metrics used to compute simple features.
    std::vector<SegmentData>                   _segments;       // Known segment starting points.
    uint32_t                                   _alternativeSegmentationsTried;
    std::vector<BitVectorData>                 _cachedHits;
};

}