blob: f6025dc6ba7ff986e7275c8536e96330ca453ccf (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.significance;
import com.yahoo.component.annotation.Inject;
import com.yahoo.component.chain.dependencies.Before;
import com.yahoo.component.chain.dependencies.Provides;
import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.NullItem;
import com.yahoo.prelude.query.WordItem;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
import com.yahoo.search.result.ErrorMessage;
import com.yahoo.search.schema.RankProfile;
import com.yahoo.search.schema.Schema;
import com.yahoo.search.schema.SchemaInfo;
import com.yahoo.search.searchchain.Execution;
import java.util.HashSet;
import java.util.Optional;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING;
/**
 * Searcher which assigns a significance value to every {@link WordItem} in the query tree.
 *
 * <p>The value is an inverse document frequency computed from the document frequency reported by
 * the {@link SignificanceModel} registered for the parsing language of the query. The query is
 * passed on unmodified when the selected rank profile does not enable the significance model in
 * the schemas of the query, or when no model is available for the language. If the schemas of the
 * query disagree on whether the significance model is enabled for the rank profile, the query is
 * rejected with an illegal query error.</p>
 *
 * @author MariusArhaug
 */
@Provides(SignificanceSearcher.SIGNIFICANCE)
@Before(STEMMING)
public class SignificanceSearcher extends Searcher {

    public static final String SIGNIFICANCE = "Significance";

    private static final Logger log = Logger.getLogger(SignificanceSearcher.class.getName());

    /** Error message template; the arguments are the rank profile name and the schema names. */
    private static final String INCONSISTENT_SETUP_MESSAGE =
            "Inconsistent 'significance' configuration for the rank profile '%s' in the schemas %s. " +
            "Use 'restrict' to limit the query to a subset of schemas " +
            "(https://docs.vespa.ai/en/schemas.html#multiple-schemas). " +
            "Specify same 'significance' configuration for all selected schemas " +
            "(https://docs.vespa.ai/en/reference/schema-reference.html#significance).";

    private final SignificanceModelRegistry significanceModelRegistry;
    private final SchemaInfo schemaInfo;

    @Inject
    public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry, SchemaInfo schemaInfo) {
        this.significanceModelRegistry = significanceModelRegistry;
        this.schemaInfo = schemaInfo;
    }

    /**
     * Annotates the word items of the query with significance when the rank profile asks for it,
     * then passes the query down the chain.
     *
     * @param query the query to annotate
     * @param execution the execution used to invoke the rest of the search chain
     * @return the result from the rest of the chain, or a result carrying an illegal query error
     *         when the schemas of the query have conflicting significance configuration
     */
    @Override
    public Result search(Query query, Execution execution) {
        String profileName = query.getRanking().getProfile();

        // Schema name -> whether the rank profile enables the significance model in that schema
        var enabledBySchema = schemaInfo.newSession(query).schemas().stream()
                .collect(Collectors.toMap(Schema::name, schema -> significanceEnabled(schema, profileName)));
        var distinctSettings = new HashSet<>(enabledBySchema.values());

        // More than one distinct setting means the schemas disagree: refuse the query
        if (distinctSettings.size() > 1)
            return illegalQuery(query, INCONSISTENT_SETUP_MESSAGE.formatted(profileName, enabledBySchema.keySet()));

        // At this point there is at most one distinct setting, shared by all schemas
        boolean modelRequested = !enabledBySchema.isEmpty() && distinctSettings.iterator().next();
        if (modelRequested) {
            // Without a model for the parsing language the query is left as it is
            significanceModelRegistry.getModel(query.getModel().getParsingLanguage())
                    .ifPresent(model -> setIDF(query.getModel().getQueryTree().getRoot(), model));
        }
        return execution.search(query);
    }

    /**
     * Returns whether the named rank profile enables the significance model in the given schema.
     * A schema lacking the rank profile counts as disabled; such a query is expected to fail
     * later (in a "backend searcher") anyway.
     */
    private static Boolean significanceEnabled(Schema schema, String rankProfileName) {
        return Optional.ofNullable(schema.rankProfiles().get(rankProfileName))
                .map(RankProfile::useSignificanceModel)
                .orElse(false);
    }

    /** Creates a result for the query which holds just an illegal query error with the given message. */
    private static Result illegalQuery(Query query, String message) {
        var failure = new Result(query);
        failure.hits().addError(ErrorMessage.createIllegalQuery(message));
        return failure;
    }

    /**
     * Walks the tree below (and including) the given item depth first, and sets the significance
     * of each word item from the document frequency the model reports for its word.
     * Null, {@link NullItem} and any other non-word, non-composite items are left untouched.
     */
    private void setIDF(Item root, SignificanceModel significanceModel) {
        if (root instanceof NullItem) return; // a null root matches none of the cases below

        if (root instanceof WordItem word) {
            var frequency = significanceModel.documentFrequency(word.getWord());
            word.setSignificance(calculateIDF(frequency.corpusSize(), frequency.frequency()));
        } else if (root instanceof CompositeItem composite) {
            for (int child = 0; child < composite.getItemCount(); child++)
                setIDF(composite.getItem(child), significanceModel);
        }
    }

    /**
     * Computes an inverse document frequency as {@code ln(1 + (N - nq_i + 0.5) / (nq_i + 0.5))}.
     *
     * @param N the number of documents in the corpus
     * @param nq_i the number of documents containing the term
     * @return the inverse document frequency of the term
     */
    public static double calculateIDF(long N, long nq_i) {
        double withoutTerm = N - nq_i + 0.5;
        double withTerm = nq_i + 0.5;
        return Math.log(1 + withoutTerm / withTerm);
    }

}
|