diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java |
Publish
Diffstat (limited to 'container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java')
-rw-r--r-- | container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java b/container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java new file mode 100644 index 00000000000..d3916c4bfe1 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/search/querytransform/LowercasingSearcher.java @@ -0,0 +1,140 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.search.querytransform; + +import com.yahoo.prelude.IndexFacts; +import com.yahoo.prelude.query.*; +import com.yahoo.search.Query; +import com.yahoo.search.Result; +import com.yahoo.search.Searcher; +import com.yahoo.search.searchchain.Execution; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static com.yahoo.language.LinguisticsCase.toLowerCase; + +/** + * Traverse a query tree and lowercase terms based on decision made in subclasses. + * + * @since 5.1.3 + * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + */ +public abstract class LowercasingSearcher extends Searcher { + + private final boolean transformWeightedSets; + + public LowercasingSearcher() { + this(new LowercasingConfig(new LowercasingConfig.Builder())); + } + + public LowercasingSearcher(LowercasingConfig cfg) { + this.transformWeightedSets = cfg.transform_weighted_sets(); + } + + @Override + public Result search(Query query, Execution execution) { + IndexFacts.Session indexFacts = execution.context().getIndexFacts().newSession(query); + traverse(query.getModel().getQueryTree(), indexFacts); + traverseHighlight(query.getPresentation().getHighlight(), indexFacts); + query.trace("Lowercasing", true, 2); + return execution.search(query); + } + + private void traverseHighlight(Highlight highlight, IndexFacts.Session indexFacts) { + if (highlight == null) return; + + for (AndItem item : highlight.getHighlightItems().values()) { + traverse(item, indexFacts); + } + } + + private void traverse(CompositeItem base, IndexFacts.Session indexFacts) { + for (Iterator<Item> i = base.getItemIterator(); i.hasNext();) { + Item next = i.next(); + if (next instanceof WordItem) { + lowerCase((WordItem) next, indexFacts); + } else if (next instanceof CompositeItem) { + traverse((CompositeItem) next, indexFacts); + } else if (next instanceof WeightedSetItem) { + if (transformWeightedSets) { + lowerCase((WeightedSetItem) next, indexFacts); + } + } else if (next instanceof WordAlternativesItem) { + lowerCase((WordAlternativesItem) next, indexFacts); + } + } + } + + private void lowerCase(WordItem word, IndexFacts.Session indexFacts) { + if (shouldLowercase(word, indexFacts)) { + word.setWord(toLowerCase(word.getWord())); + word.setLowercased(true); + } + } + + private static final class WeightedSetToken { + final String token; + final String originalToken; + final int weight; + + WeightedSetToken(String token, String originalToken, int weight) { + this.token = token; + this.originalToken = originalToken; + this.weight = weight; + } + } + + private boolean syntheticLowerCaseCheck(String indexName, IndexFacts.Session indexFacts, boolean isFromQuery) { + WordItem w = new WordItem("", indexName, isFromQuery); + return shouldLowercase(w, indexFacts); + } + + private void lowerCase(WeightedSetItem set, IndexFacts.Session indexFacts) { + if (!syntheticLowerCaseCheck(set.getIndexName(), indexFacts, true)) { + return; + } + + List<WeightedSetToken> terms = new ArrayList<>(set.getNumTokens()); + for (Iterator<Map.Entry<Object, Integer>> i = set.getTokens(); i.hasNext();) { + Map.Entry<Object, Integer> e = i.next(); + if (e.getKey() instanceof String) { + String originalToken = (String) e.getKey(); + String token = toLowerCase(originalToken); + if ( ! originalToken.equals(token)) { + terms.add(new WeightedSetToken(token, originalToken, e.getValue().intValue())); + } + } + } + // has to do it in two passes on cause of the "interesting" API in + // weighted set, and remove before put on cause of the semantics of + // addInternal as well as changed values... + for (WeightedSetToken t : terms) { + set.removeToken(t.originalToken); + set.addToken(t.token, t.weight); + } + } + + private void lowerCase(WordAlternativesItem alternatives, IndexFacts.Session indexFacts) { + if (!syntheticLowerCaseCheck(alternatives.getIndexName(), indexFacts, alternatives.isFromQuery())) { + return; + } + for (WordAlternativesItem.Alternative term : alternatives.getAlternatives()) { + String lowerCased = toLowerCase(term.word); + alternatives.addTerm(lowerCased, term.exactness * .7d); + } + + } + + /** + * Override this to control whether a given term should be lowercased. + * + * @param word a WordItem or subclass thereof which is a candidate for lowercasing + * @return whether to convert the term to lower case + */ + public abstract boolean shouldLowercase(WordItem word, IndexFacts.Session indexFacts); + +} |