aboutsummaryrefslogtreecommitdiffstats
path: root/container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java
blob: 47fec584153087b5f94791bf0e10b164ea0b9e1d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.querytransform;

import com.yahoo.prelude.IndexFacts;
import com.yahoo.prelude.query.*;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
import com.yahoo.search.searchchain.Execution;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import static com.yahoo.language.LinguisticsCase.toLowerCase;

/**
 * Traverse a query tree and lowercase terms based on decision made in subclasses.
 *
 * @author Steinar Knutsen
 */
public abstract class LowercasingSearcher extends Searcher {

    /** Whether the tokens of weighted set items are lowercased as well; read from config. */
    private final boolean transformWeightedSets;

    /** Creates a searcher configured from a LowercasingConfig built by an untouched builder. */
    public LowercasingSearcher() {
        this(new LowercasingConfig(new LowercasingConfig.Builder()));
    }

    /**
     * Creates a searcher which takes the weighted set setting from the given config.
     *
     * @param cfg the config whose transform_weighted_sets value is used
     */
    public LowercasingSearcher(LowercasingConfig cfg) {
        this.transformWeightedSets = cfg.transform_weighted_sets();
    }

    /**
     * Lowercases the terms of the query tree and of the highlight items (if any),
     * traces the query and passes it on down the chain.
     *
     * @param query the query to transform in place
     * @param execution the execution used to look up index facts and to continue the search
     * @return the result of the rest of the search chain
     */
    @Override
    public Result search(Query query, Execution execution) {
        IndexFacts.Session session = execution.context().getIndexFacts().newSession(query);

        CompositeItem root = query.getModel().getQueryTree();
        traverse(root, session);

        Highlight highlight = query.getPresentation().getHighlight();
        traverseHighlight(highlight, session);

        query.trace("Lowercasing", true, 2);
        return execution.search(query);
    }

    /** Traverses every highlight item tree; does nothing when there is no highlight. */
    private void traverseHighlight(Highlight highlight, IndexFacts.Session indexFacts) {
        if (highlight != null) {
            for (AndItem highlightRoot : highlight.getHighlightItems().values()) {
                traverse(highlightRoot, indexFacts);
            }
        }
    }

    /** Visits each direct child of the given composite, descending into nested composites. */
    private void traverse(CompositeItem base, IndexFacts.Session indexFacts) {
        Iterator<Item> children = base.getItemIterator();
        while (children.hasNext()) {
            visit(children.next(), indexFacts);
        }
    }

    /**
     * Handles a single child item. The type checks are tried in a fixed order and only
     * the first matching one is acted upon; items matching none of them are left alone.
     */
    private void visit(Item child, IndexFacts.Session indexFacts) {
        if (child instanceof WordItem) {
            lowerCase((WordItem) child, indexFacts);
            return;
        }
        if (child instanceof SameElementItem) {
            traverseSameElement((SameElementItem) child, indexFacts);
            return;
        }
        if (child instanceof CompositeItem) {
            traverse((CompositeItem) child, indexFacts);
            return;
        }
        if (child instanceof WeightedSetItem) {
            // Weighted sets are only touched when enabled by config
            if (transformWeightedSets) {
                lowerCase((WeightedSetItem) child, indexFacts);
            }
            return;
        }
        if (child instanceof WordAlternativesItem) {
            lowerCase((WordAlternativesItem) child, indexFacts);
        }
    }

    /**
     * Lowercases the word items directly below a same-element item, passing the field name
     * of the same-element item along to the decision. Children of other types are skipped.
     */
    private void traverseSameElement(SameElementItem base, IndexFacts.Session indexFacts) {
        Iterator<Item> children = base.getItemIterator();
        while (children.hasNext()) {
            Item child = children.next();
            if ( ! (child instanceof WordItem)) continue;
            lowerCase(base.getFieldName(), (WordItem) child, indexFacts);
        }
    }

    /** Lowercases the word if the subclass says so for this word. */
    private void lowerCase(WordItem word, IndexFacts.Session indexFacts) {
        if ( ! shouldLowercase(word, indexFacts)) return;
        replaceWithLowercased(word);
    }

    /** Lowercases the word if the subclass says so for this word under the given common path. */
    private void lowerCase(String commonPath, WordItem word, IndexFacts.Session indexFacts) {
        if ( ! shouldLowercase(commonPath, word, indexFacts)) return;
        replaceWithLowercased(word);
    }

    /** Replaces the word of the item by its lowercased form and flags the item as lowercased. */
    private void replaceWithLowercased(WordItem word) {
        String lowercased = toLowerCase(word.getWord());
        word.setWord(lowercased);
        word.setLowercased(true);
    }

    /** A pending replacement of one weighted set token by its lowercased form. */
    private static final class WeightedSetToken {

        final String token;
        final String originalToken;
        final int weight;

        WeightedSetToken(String lowercased, String original, int weight) {
            this.token = lowercased;
            this.originalToken = original;
            this.weight = weight;
        }

    }

    /**
     * Asks the subclass about a throwaway word item created for the given index,
     * for item types which are not word items themselves.
     */
    private boolean syntheticLowerCaseCheck(String indexName, IndexFacts.Session indexFacts, boolean isFromQuery) {
        return shouldLowercase(new WordItem("not-used", indexName, isFromQuery), indexFacts);
    }

    /**
     * Replaces every string token of the set which is changed by lowercasing
     * with its lowercased form, keeping the weight. Non-string tokens are left alone.
     */
    private void lowerCase(WeightedSetItem set, IndexFacts.Session indexFacts) {
        boolean accepted = syntheticLowerCaseCheck(set.getIndexName(), indexFacts, true);
        if ( ! accepted) return;

        // First pass: collect the tokens which actually change
        List<WeightedSetToken> changed = new ArrayList<>(set.getNumTokens());
        Iterator<Map.Entry<Object, Integer>> entries = set.getTokens();
        while (entries.hasNext()) {
            Map.Entry<Object, Integer> entry = entries.next();
            if ( ! (entry.getKey() instanceof String)) continue;

            String original = (String) entry.getKey();
            String lowercased = toLowerCase(original);
            if (original.equals(lowercased)) continue;

            changed.add(new WeightedSetToken(lowercased, original, entry.getValue().intValue()));
        }

        // Second pass: apply the changes. This has to be done in two passes because of the
        // "interesting" API of weighted set, and the original token is removed before the
        // new one is put because of the semantics of addInternal as well as changed values.
        for (WeightedSetToken replacement : changed) {
            set.removeToken(replacement.originalToken);
            set.addToken(replacement.token, replacement.weight);
        }
    }

    /**
     * Adds the lowercased form of each alternative as a term,
     * with the exactness of that alternative scaled by 0.7.
     */
    private void lowerCase(WordAlternativesItem alternatives, IndexFacts.Session indexFacts) {
        boolean accepted = syntheticLowerCaseCheck(alternatives.getIndexName(), indexFacts, alternatives.isFromQuery());
        if ( ! accepted) return;

        for (WordAlternativesItem.Alternative alternative : alternatives.getAlternatives()) {
            alternatives.addTerm(toLowerCase(alternative.word), alternative.exactness * .7d);
        }
    }

    /**
     * Override this to control whether a given term should be lowercased.
     *
     * @param word a WordItem or subclass thereof which is a candidate for lowercasing
     * @param indexFacts the index facts session created for the query being transformed
     * @return whether to convert the term to lower case
     */
    public abstract boolean shouldLowercase(WordItem word, IndexFacts.Session indexFacts);

    /**
     * Override this to control whether a given term below a same-element item should be lowercased.
     *
     * @param commonPath the field name of the same-element item containing the word
     * @param word a WordItem or subclass thereof which is a candidate for lowercasing
     * @param indexFacts the index facts session created for the query being transformed
     * @return whether to convert the term to lower case
     */
    public abstract boolean shouldLowercase(String commonPath, WordItem word, IndexFacts.Session indexFacts);

}