aboutsummaryrefslogtreecommitdiffstats
path: root/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
blob: f6025dc6ba7ff986e7275c8536e96330ca453ccf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.significance;

import com.yahoo.component.annotation.Inject;
import com.yahoo.component.chain.dependencies.Before;
import com.yahoo.component.chain.dependencies.Provides;
import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.NullItem;
import com.yahoo.prelude.query.WordItem;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
import com.yahoo.search.result.ErrorMessage;
import com.yahoo.search.schema.RankProfile;
import com.yahoo.search.schema.Schema;
import com.yahoo.search.schema.SchemaInfo;
import com.yahoo.search.searchchain.Execution;

import java.util.HashSet;
import java.util.Optional;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING;

/**
 * Searcher which annotates the word items of a query with a significance value.
 *
 * <p>For every schema in the session created for the query, the rank profile selected by the query
 * is looked up to see whether it uses a significance model. A schema which does not have the rank
 * profile counts as not using one. If the schemas disagree, a result carrying an illegal-query
 * error is returned and the query is not passed further down the chain. If they agree that a model
 * is used, and the registry has a model for the parsing language of the query, every
 * {@link WordItem} reachable from the query tree root through {@link CompositeItem}s gets its
 * significance set to the value of {@link #calculateIDF(long, long)} for the document frequency
 * the model reports for that word. In all other cases the query is passed on unmodified.</p>
 *
 * @author MariusArhaug
 */
@Provides(SignificanceSearcher.SIGNIFICANCE)
@Before(STEMMING)
public class SignificanceSearcher extends Searcher {

    /** The name used in the {@code @Provides} annotation of this searcher. */
    public static final String SIGNIFICANCE = "Significance";

    private static final Logger log = Logger.getLogger(SignificanceSearcher.class.getName());

    /** Template of the error reported when schemas disagree: first argument is the rank profile, second the schemas. */
    private static final String INCONSISTENT_SETUP_MESSAGE =
            "Inconsistent 'significance' configuration for the rank profile '%s' in the schemas %s. " +
            "Use 'restrict' to limit the query to a subset of schemas " +
            "(https://docs.vespa.ai/en/schemas.html#multiple-schemas). " +
            "Specify same 'significance' configuration for all selected schemas " +
            "(https://docs.vespa.ai/en/reference/schema-reference.html#significance).";

    private final SignificanceModelRegistry significanceModelRegistry;
    private final SchemaInfo schemaInfo;

    /**
     * Creates a significance searcher.
     *
     * @param significanceModelRegistry the registry asked for a model matching the query's parsing language
     * @param schemaInfo the source of the schemas (and their rank profiles) a query is resolved against
     */
    @Inject
    public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry, SchemaInfo schemaInfo) {
        this.significanceModelRegistry = significanceModelRegistry;
        this.schemaInfo = schemaInfo;
    }

    /**
     * Sets significance on the word items of the query when the selected rank profile asks for it,
     * then passes the query on.
     *
     * @param query the query to annotate
     * @param execution the execution used to pass the query on to the rest of the chain
     * @return the result from the rest of the chain, or a result holding only an illegal-query error
     *         if the schemas of the query session have differing significance setups for the rank profile
     */
    @Override
    public Result search(Query query, Execution execution) {
        var rankProfileName = query.getRanking().getProfile();

        // Schema name -> whether the selected rank profile uses a significance model in that schema
        var significanceBySchema = schemaInfo.newSession(query).schemas().stream()
                .collect(Collectors.toMap(Schema::name, schema -> usesSignificanceModel(schema, rankProfileName)));
        var distinctSetups = new HashSet<>(significanceBySchema.values());

        // Both 'true' and 'false' present: the selected schemas are in conflict
        if (distinctSetups.size() > 1)
            return inconsistentSetupError(query, rankProfileName, significanceBySchema.keySet());

        // At most one distinct setup remains; it only exists if there was at least one schema
        boolean significanceEnabled = ! significanceBySchema.isEmpty() && distinctSetups.iterator().next();
        if (significanceEnabled) {
            Language language = query.getModel().getParsingLanguage();
            // Without a model for the language the query is left as it is
            significanceModelRegistry.getModel(language)
                    .ifPresent(model -> setIDF(query.getModel().getQueryTree().getRoot(), model));
        }
        return execution.search(query);
    }

    /**
     * Returns whether the named rank profile of the given schema uses a significance model.
     * A missing rank profile is treated as "not used": per the original author's note this leads
     * to a failure later (in a "backend searcher") anyway.
     */
    private static boolean usesSignificanceModel(Schema schema, String rankProfileName) {
        return Optional.ofNullable(schema.rankProfiles().get(rankProfileName))
                       .map(RankProfile::useSignificanceModel)
                       .orElse(false);
    }

    /** Builds the result returned when the schemas do not agree on the significance setup. */
    private static Result inconsistentSetupError(Query query, String rankProfileName, Iterable<String> schemaNames) {
        var failure = new Result(query);
        var message = INCONSISTENT_SETUP_MESSAGE.formatted(rankProfileName, schemaNames);
        failure.hits().addError(ErrorMessage.createIllegalQuery(message));
        return failure;
    }

    /**
     * Recursively assigns a significance to each word item below (and including) the given item.
     * Composite items are descended into in child order; items which are neither words
     * nor composites are left untouched.
     */
    private void setIDF(Item root, SignificanceModel significanceModel) {
        if (root == null) return;
        if (root instanceof NullItem) return;

        if (root instanceof WordItem) {
            WordItem word = (WordItem) root;
            var frequency = significanceModel.documentFrequency(word.getWord());
            word.setSignificance(calculateIDF(frequency.corpusSize(), frequency.frequency()));
            return;
        }

        if ( ! (root instanceof CompositeItem)) return;

        CompositeItem composite = (CompositeItem) root;
        for (int child = 0; child < composite.getItemCount(); child++) {
            setIDF(composite.getItem(child), significanceModel);
        }
    }

    /**
     * Computes {@code ln(1 + (N - nq_i + 0.5) / (nq_i + 0.5))}.
     *
     * @param N the corpus size, as reported by the significance model's document frequency
     * @param nq_i the frequency of the word, as reported by the significance model's document frequency
     * @return the inverse document frequency value for the given counts
     */
    public static double calculateIDF(long N, long nq_i) {
        double numerator = N - nq_i + 0.5;
        double denominator = nq_i + 0.5;
        return Math.log(1 + numerator / denominator);
    }
}