aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/main/java/com/yahoo/searchlib/ranking/features/fieldmatch/FieldMatchMetrics.java
blob: 2725165d527977cadf64d006bde20d45ef2f7d6a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.searchlib.ranking.features.fieldmatch;

import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static java.lang.Math.*;

/**
 * The collection of metrics calculated by the string match metric calculator.
 *
 * @author  bratseth
 */
public final class FieldMatchMetrics implements Cloneable {

    /** The calculator creating this - given on initialization */
    private final FieldMatchMetricsComputer source;

    /** The trace accumulated during execution - empty if no tracing */
    private final Trace trace = new Trace();

    private boolean complete;

    // Metrics
    private int outOfOrder;
    private int segments;
    private int gaps;
    private int gapLength;
    private int longestSequence;
    private int head;
    private int tail;
    private int matches;
    private float proximity;
    private float unweightedProximity;
    private float segmentDistance;
    private int pairs;
    private float weight;
    private float significance;
    private float occurrence;
    private float weightedOccurrence;
    private float absoluteOccurrence;
    private float weightedAbsoluteOccurrence;
    private float significantOccurrence;
    private float weightedExactnessSum;
    private int weightSum;

    // Temporary variables
    private int currentSequence;
    private List<Integer> segmentStarts=new ArrayList<>();
    private int queryLength;

    public FieldMatchMetrics(FieldMatchMetricsComputer source) {
        this.source=source;

        complete=false;

        outOfOrder = 0;
        segments = 0;
        gaps = 0;
        gapLength = 0;
        longestSequence = 1;
        head = -1;
        tail = -1;
        proximity = 0;
        unweightedProximity = 0;
        segmentDistance = 0;
        matches = 0;
        pairs = 0;
        weight = 0;
        significance = 0;
        weightedExactnessSum = 0;
        weightSum = 0;

        currentSequence=0;
        segmentStarts.clear();
        queryLength = source.getQuery().getTerms().length;
    }

    /** Are these metrics representing a complete match */
    public boolean isComplete() { return complete; }

    public void setComplete(boolean complete) { this.complete=complete; }

    /** Returns the segment start points */
    public List<Integer> getSegmentStarts() { return segmentStarts; }

    /**
     * Returns a metric by name
     *
     * @throws IllegalArgumentException if the metric name (case sensitive) is not present
     */
    public float get(String name) {
        try {
            Method getter = getClass().getMethod("get" + name.substring(0, 1).toUpperCase() + name.substring(1));
            return ((Number)getter.invoke(this)).floatValue();
        }
        catch (NoSuchMethodException e) {
            throw new IllegalArgumentException("No metric named '" + name + "' is known");
        }
        catch (Exception e) {
            throw new RuntimeException("Error getting metric '" + name + "'",e);
        }
    }

    // Base metrics ----------------------------------------------------------------------------------------------

    /** Returns the total number of out of order token sequences within field segments */
    public int getOutOfOrder() { return outOfOrder; }

    /** Returns the number of field text segments which are needed to match the query as completely as possible */
    public int getSegments() { return segments; }

    /** Returns the total number of position jumps (backward or forward) within document segments */
    public int getGaps() { return gaps; }

    /** Returns the summed size of all gaps within segments */
    public int getGapLength() { return gapLength; }

    /** Returns the size of the longest matched continuous, in-order sequence in the document */
    public int getLongestSequence() { return longestSequence; }

    /** Returns the number of tokens in the field preceding the start of the first matched segment */
    public int getHead() { return head; }

    /** Returns the number of tokens in the field following the end of the last matched segment */
    public int getTail() { return tail; }

    /** Returns the number of query terms which was matched in this field */
    public int getMatches() { return matches; }

    /** Returns the number of in-segment token pairs */
    public int getPairs() { return pairs; }

    /**
     * Returns the normalized proximity of the matched terms, weighted by the connectedness of the query terms.
     * This number is 0.1 if all the matched terms are and have default or lower connectedness, close to 1 if they
     * are following in sequence and have a high connectedness, and close to 0 if they are far from each other in the
     * segment or out of order
     */
    public float getAbsoluteProximity() {
        if (pairs < 1) return 0.1f;

        return proximity/pairs;
    }

    /**
     * Returns the normalized proximity of the matched terms, not taking term connectedness into account.
     * This number is close to 1 if all the matched terms are
     * following each other in sequence, and close to 0 if they are far from each other or out of order
     */
    public float getUnweightedProximity() {
        if (pairs < 1) return 1f;
        return unweightedProximity/pairs;
    }

    /**
     * Returns the sum of the distance between all segments making up a match to the query, measured
     * as the sum of the number of token positions separating the <i>start</i> of each field adjacent segment.
     */
    public float getSegmentDistance() { return segmentDistance; }

    /**
     * <p>Returns the normalized weight of this match relative to the whole query:
     * The sum of the weights of all <i>matched</i> terms/the sum of the weights of all <i>query</i> terms
     * If all the query terms were matched, this is 1. If no terms were matched, or these matches has weight zero,
     * this is 0.</p>
     *
     * <p>As the sum of this number over all the terms of the query is always 1, sums over all fields of
     * normalized rank features for each field multiplied by this number for the same field will produce a
     * normalized number.</p>
     *
     * <p>Note that this scales with the number of matched query terms in the field. If you want a component which does
     * not, divide by matches.</p>
     */
    public float getWeight() { return weight; }

    /**
     * <p>Returns the normalized term significance (1-frequency) of the terms of this match relative to the whole query:
     * The sum of the significance of all <i>matched</i> terms/the sum of the significance of all <i>query</i> terms
     * If all the query terms were matched, this is 1. If no terms were matched, or if the significance of all the matched terms
     * is zero (they are present in all (possible) documents), this number is zero.</p>
     *
     * <p>As the sum of this number over all the terms of the query is always 1, sums over all fields of
     * normalized rank features for each field multiplied by this number for the same field will produce a
     * normalized number.</p>
     *
     * <p>Note that this scales with the number of matched query terms in the field. If you want a component which does
     * not, divide by matches.</p>
     */
    public float getSignificance() { return significance; }

    /**
     * <p>Returns a normalized measure of the number of occurrence of the terms of the query.
     * This number is 1 if there are many occurrences of the query terms <i>in absolute terms,
     * or relative to the total content of the field</i>, and 0 if there are none.</p>
     *
     * <p>This is suitable for occurrence in fields containing regular text.</p>
     */
    public float getOccurrence() { return occurrence; }

    /**
     * <p>Returns a normalized measure of the number of occurrence of the terms of the query:
     *
     * <code>sum over all query terms(min(number of occurrences of the term,maxOccurrences))/(query term count*100)</code>
     *
     * <p>This number is 1 if there are many occurrences of the query terms, and 0 if there are none.
     * This number does not take the actual length of the field into account, so it is suitable for uses of occurrence
     * to denote importance across multiple terms.</p>
     */
    public float getAbsoluteOccurrence() { return absoluteOccurrence; }

    /**
     * <p>Returns a normalized measure of the number of occurrence of the terms of the query, weighted by term weight.
     * This number is close to 1 if there are many occurrences of highly weighted query terms,
     * in absolute terms, or relative to the total content of the field, and 0 if there are none.</p>
     */
    public float getWeightedOccurrence() { return weightedOccurrence; }

    /**
     * <p>Returns a normalized measure of the number of occurrence of the terms of the query, taking weights
     * into account so that occurrences of higher weighted query terms has more impact than lower weighted terms.</p>
     *
     * <p>This number is 1 if there are many occurrences of the highly weighted terms, and 0 if there are none.
     * This number does not take the actual length of the field into account, so it is suitable for uses of occurrence
     * to denote importance across multiple terms.</p>
     */
    public float getWeightedAbsoluteOccurrence() { return weightedAbsoluteOccurrence; }

    /**
     * <p>Returns a normalized measure of the number of occurrence of the terms of the query
     * <i>in absolute terms,
     * or relative to the total content of the field</i>, weighted by term significance.
     *
     * <p>This number is 1 if there are many occurrences of the highly significant terms, and 0 if there are none.</p>
     */
    public float getSignificantOccurrence() { return significantOccurrence; }

    /**
     * <p>Returns the degree to which the query terms submitted matched exactly terms contained in the document.
     * This is 1 if all the terms matched exactly, and closer to 0 as more of the terms was matched only as stem forms.
     * </p>
     *
     * <p>This is the query term weighted average of the exactness of each match, where the exactness of a match is
     * the product of the exactness of the matching query term and the matching field term:
     * <code>
     * sum over matching query terms(query term weight * query term exactness * field term exactness) /
     * sum over matching query terms(query term weight)
     * </code>
     */
    public float getExactness() {
        if (matches == 0) return 0;
        return weightedExactnessSum / weightSum;
    }

    // Derived metrics ----------------------------------------------------------------------------------------------

    /** The ratio of query tokens which was matched in the field: <code>matches/queryLength</code> */
    public float getQueryCompleteness() {
        return (float)matches/source.getQuery().getTerms().length;
    }

    /** The ratio of query tokens which was matched in the field: <code>matches/fieldLength</code> */
    public float getFieldCompleteness() {
        return (float)matches/source.getField().terms().size();
    }

    /**
     * Total completeness, where field completeness is more important:
     * <code>queryCompleteness * ( 1 - fieldCompletenessImportance) + fieldCompletenessImportance * fieldCompleteness</code>
     */
    public float getCompleteness() {
        float fieldCompletenessImportance = source.getParameters().getFieldCompletenessImportance();
        return getQueryCompleteness() * ( 1 - fieldCompletenessImportance) + fieldCompletenessImportance*getFieldCompleteness();
    }

    /** Returns how well the order of the terms agreed in segments: <code>1-outOfOrder/pairs</code> */
    public float getOrderness() {
        if (pairs == 0) return 1f;
        return 1-(float)outOfOrder/pairs;
    }

    /** Returns the degree to which different terms are related (occurring in the same segment): <code>1-segments/(matches-1)</code> */
    public float getRelatedness() {
        if (matches == 0) return 0;
        if (matches == 1) return 1;
        return 1 - (float)(segments - 1) / (matches - 1);
    }

    /** Returns <code>longestSequence/matches</code> */
    public float getLongestSequenceRatio() {
        if (matches == 0) return 0;
        return (float)longestSequence / matches;
    }

    /** Returns the closeness of the segments in the field: <code>1-segmentDistance/fieldLength</code> */
    public float getSegmentProximity() {
        if (matches == 0) return 0;
        return 1 - segmentDistance / source.getField().terms().size();
    }

    /**
     * Returns a value which is close to 1 when matched terms are close and close to zero when they are far apart
     * in the segment. Relatively more connected terms influence this value more.
     * This is absoluteProximity/average connectedness.
     */
    public float getProximity() {
        float totalConnectedness = 0;
        for (int i = 1; i < queryLength; i++) {
            totalConnectedness += (float)Math.max(0.1, source.getQuery().getTerms()[i].getConnectedness());
        }
        float averageConnectedness = 0.1f;
        if (queryLength > 1)
            averageConnectedness = totalConnectedness / (queryLength - 1);
        return getAbsoluteProximity() / averageConnectedness;
    }

    /**
     * <p>Returns the average of significance and weight.</p>
     *
     * <p>As the sum of this number over all the terms of the query is always 1, sums over all fields of
     * normalized rank features for each field multiplied by this number for the same field will produce a
     * normalized number.</p>
     *
     * <p>Note that this scales with the number of matched query terms in the field. If you want a component which does
     * not, divide by matches.</p>
     */
    public float getImportance() {
        return (getSignificance() + getWeight()) / 2;
    }

    /** A normalized measure of how early the first segment occurs in this field: <code>1-head/(max(6,field.length)-1)</code> */
    public float getEarliness() {
        if (matches == 0) return 0; // Covers field.length==0 too
        if (source.getField().terms().size() == 1) return 1;
        return 1 - (float)head/(max(6, source.getField().terms().size()) - 1);
    }

    /**
     * <p>A ready-to-use aggregate match score. Use this if you don't have time to find a better application specific
     * aggregate score of the fine grained match metrics.</p>
     *
     * <p>The current formula is
     *
     * <code>
     * ( proximityCompletenessImportance * (1-relatednessImportance + relatednessImportance*relatedness)
     *               proximity * exactness * completeness^2 + earlinessImportance * earliness + segmentProximityImportance * segmentProximity )
     * / (proximityCompletenessImportance + earlinessImportance + relatednessImportance)</code>
     *
     * but this is subject to change (i.e improvement) at any time.
     * </p>
     *
     *
     * <p>Weight and significance are not taken into account because this is meant to capture tha quality of the
     * match in this field, while those measures relate this match to matches in other fields. This number
     * can be multiplied with those values when combining with other field match scores.</p>
     */
    public float getMatch() {
        float proximityCompletenessImportance = source.getParameters().getProximityCompletenessImportance();
        float earlinessImportance = source.getParameters().getEarlinessImportance();
        float relatednessImportance = source.getParameters().getRelatednessImportance();
        float segmentProximityImportance = source.getParameters().getSegmentProximityImportance();
        float occurrenceImportance = source.getParameters().getOccurrenceImportance();
        float scaledRelatedness = 1 - relatednessImportance + relatednessImportance*getRelatedness();

        return ( proximityCompletenessImportance * scaledRelatedness * getProximity() * getExactness() * getCompleteness() * getCompleteness()
                 + earlinessImportance * getEarliness()
                 + segmentProximityImportance * getSegmentProximity()
                 + occurrenceImportance * getOccurrence())
               / (proximityCompletenessImportance + earlinessImportance + segmentProximityImportance + occurrenceImportance);
    }

    /**
     * <p>The metric use to select the best segments during execution of the string match metric algorithm.</p>
     *
     * <p>This metric, and any metric it depends on, must be correct each time a segment is completed,
     * not only when the metrics are complete, because this metric is used to choose segments during calculation.</p>
     */
    float getSegmentationScore() {
        if (segments == 0) return 0;
        return getAbsoluteProximity() * getExactness() / (segments * segments);
    }

    // Events emitted from the computer while matching strings  ----------------------------------------------------
    // Note that one move in the computer may cause multiple events

    // Events on single positions ----------

    /** Called once for every match */
    void onMatch(int i, int j) {
        if (matches >= source.getField().terms().size()) return;
        matches++;
        weight += (float)source.getQuery().getTerms()[i].getWeight() / source.getQuery().getTotalTermWeight();
        significance += source.getQuery().getTerms()[i].getSignificance() / source.getQuery().getTotalSignificance();
        int queryTermWeight = source.getQuery().getTerms()[i].getWeight();
        weightedExactnessSum += queryTermWeight * source.getQuery().getTerms()[i].getExactness() * source.getField().terms().get(j).exactness();
        weightSum += queryTermWeight;
    }

    /** Called once per sequence, when the sequence starts */
    void onSequenceStart(int j) {
        if (head==-1 || j<head)
            head=j;

        currentSequence=1;
    }

    /** Called once per sequence when the sequence ends */
    void onSequenceEnd(int j) {
        int sequenceTail = source.getField().terms().size() - j - 1;
        if (tail ==-1 || sequenceTail < tail)
            tail = sequenceTail;

        if (currentSequence > longestSequence)
            longestSequence = currentSequence;
        currentSequence = 0;
    }

    /** Called once when this value is calculated, before onComplete */
    void setOccurrence(float occurrence) { this.occurrence = occurrence; }

    /** Called once when this value is calculated, before onComplete */
    void setWeightedOccurrence(float weightedOccurrence) { this.weightedOccurrence = weightedOccurrence; }

    /** Called once when this value is calculated, before onComplete */
    void setAbsoluteOccurrence(float absoluteOccurrence) { this.absoluteOccurrence = absoluteOccurrence; }

    /** Called once when this value is calculated, before onComplete */
    void setWeightedAbsoluteOccurrence(float weightedAbsoluteOccurrence) { this.weightedAbsoluteOccurrence = weightedAbsoluteOccurrence; }

    /** Called once when this value is calculated, before onComplete */
    void setSignificantOccurrence(float significantOccurrence) { this.significantOccurrence = significantOccurrence; }

    /** Called once when matching is complete */
    void onComplete() {
        // segment distance - calculated from sorted segment starts
        if (segmentStarts.size() <= 1) {
            segmentDistance = 0;
        }
        else {
            Collections.sort(segmentStarts);
            for (int i = 1; i < segmentStarts.size(); i++) {
                segmentDistance += segmentStarts.get(i) - segmentStarts.get(i - 1) + 1;
            }
        }

        if (head == -1) head = 0;
        if (tail == -1) tail = 0;
    }

    // Events on pairs ----------

    /** Called when <i>any</i> pair is encountered */
    void onPair(int i, int j, int previousJ) {
        int distance = j - previousJ - 1;
        if (distance < 0) distance++; // Discontinuity where the two terms are in the same position
        if (abs(distance) > source.getParameters().getProximityLimit()) return; // Contribution=0

        // We have an in-segment pair
        float pairProximity = source.getParameters().getProximity(distance + source.getParameters().getProximityLimit());

        unweightedProximity += pairProximity;

        float connectedness = source.getQuery().getTerms()[i].getConnectedness();
        proximity += (float)pow(pairProximity, connectedness / 0.1) * (float)max(0.1, connectedness);

        pairs++;
    }

    /** Called when an in-sequence pair is encountered */
    void onInSequence(int i, int j, int previousJ) {
        currentSequence++;
    }

    /** Called when a gap (within a sequence) is encountered */
    void onInSegmentGap(int i, int j, int previousJ) {
        gaps++;
        if (j>previousJ) {
            gapLength+=abs(j-previousJ)-1; // gap length may be 0 if the gap was in the query
        }
        else {
            outOfOrder++;
            gapLength+=abs(j-previousJ);
        }
    }

    /**
     * Called when a new segment is started
     *
     * @param previousJ the end of the previous segment, or -1 if this is the first segment
     * */
    void onNewSegment(int i, int j, int previousJ) {
        segments++;
        segmentStarts.add(j);
    }

    @Override
    public FieldMatchMetrics clone() {
        try {
            FieldMatchMetrics clone = (FieldMatchMetrics)super.clone();
            clone.segmentStarts = new ArrayList<>(segmentStarts);
            return clone;
        }
        catch (CloneNotSupportedException e) {
            throw new RuntimeException("Programming error",e);
        }
    }

    @Override
    public String toString() {
        return "Metrics: [match: " + getMatch() + "]";
    }

    public String toStringDump() {
        try {
            StringBuilder b = new StringBuilder();
            for (Method m : this.getClass().getDeclaredMethods()) {
                if ( ! m.getName().startsWith("get")) continue;
                if (m.getReturnType() != Integer.TYPE && m.getReturnType() != Float.TYPE) continue;
                if ( m.getParameterTypes().length != 0 ) continue;

                Object value = m.invoke(this, new Object[0]);
                b.append(m.getName().substring(3, 4).toLowerCase() + m.getName().substring(4) + ": " + value + "\n");
            }
            return b.toString();
        }
        catch (Exception e) {
            throw new RuntimeException("Programming error", e);
        }
    }

    /** Returns the trace of this computation. This is empty (never null) if tracing is off */
    public Trace trace() { return trace; }

}