diff options
Diffstat (limited to 'container-search/src/main/java/com/yahoo/prelude/querytransform/IndexCombinatorSearcher.java')
-rw-r--r-- | container-search/src/main/java/com/yahoo/prelude/querytransform/IndexCombinatorSearcher.java | 358 |
1 files changed, 358 insertions, 0 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/IndexCombinatorSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/IndexCombinatorSearcher.java new file mode 100644 index 00000000000..e56303e60f8 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/IndexCombinatorSearcher.java @@ -0,0 +1,358 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.prelude.querytransform; + +import static com.yahoo.prelude.querytransform.PhrasingSearcher.PHRASE_REPLACEMENT; + +import com.yahoo.component.chain.dependencies.After; +import com.yahoo.component.chain.dependencies.Before; +import com.yahoo.component.chain.dependencies.Provides; +import com.yahoo.log.LogLevel; +import com.yahoo.prelude.Index; +import com.yahoo.prelude.Index.Attribute; +import com.yahoo.prelude.IndexFacts; +import com.yahoo.prelude.query.*; +import com.yahoo.search.Query; +import com.yahoo.search.Searcher; +import com.yahoo.search.searchchain.Execution; +import com.yahoo.search.searchchain.PhaseNames; + +import java.util.*; + +/** + * Searcher to rewrite queries to achieve mixed recall between indices and + * memory attributes. + * + * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + */ +@After({PhaseNames.RAW_QUERY, PHRASE_REPLACEMENT}) +@Before(PhaseNames.TRANSFORMED_QUERY) +@Provides(IndexCombinatorSearcher.MIXED_RECALL_REWRITE) +// TODO: This is not necessary on Vespa 6, we should probably remove it from the default chain but keep it +// around until Vespa 6 to avoid breaking those who refer to it. +public class IndexCombinatorSearcher extends Searcher { + public static final String MIXED_RECALL_REWRITE = "MixedRecallRewrite"; + + private static class ArrayComparator implements Comparator<Attribute[]> { + /** + * Note, this ignores if there is a difference in whether to + * attributes have tokenized content. (If this is the case, + * we are having worse problems anyway.) + */ + public int compare(Attribute[] o1, Attribute[] o2 ) { + if (o1.length < o2.length) { + return -1; + } else if (o1.length > o2.length) { + return 1; + } + int limit = o1.length; + for (int i = 0; i < limit; ++i) { + int r = o1[i].name.compareTo(o2[i].name); + if (r != 0) { + return r; + } + } + return 0; + } + } + + private final ArrayComparator comparator = new ArrayComparator(); + + private enum RewriteStrategies { + NONE, CHEAP_AND, EXPENSIVE_AND, FLAT + } + + @Override + public com.yahoo.search.Result search(Query query, Execution execution) { + Item root = query.getModel().getQueryTree().getRoot(); + IndexFacts.Session session = execution.context().getIndexFacts().newSession(query); + String oldQuery = (query.getTraceLevel() >= 2) ? root.toString() : ""; + + if (root instanceof BlockItem || root instanceof PhraseItem) { + root = convertSinglePhraseOrBlock(root, session); + } else if (root instanceof CompositeItem) { + root = rewrite((CompositeItem) root, session); + } + query.getModel().getQueryTree().setRoot(root); + + if ((query.getTraceLevel() >= 2) && !(oldQuery.equals(root.toString()))) { + query.trace("Rewriting for mixed recall between indices and attributes", true, 2); + } + return execution.search(query); + } + + private RewriteStrategies chooseRewriteStrategy(CompositeItem c, IndexFacts.Session session) { + if (c instanceof OrItem) { + return RewriteStrategies.FLAT; + } else if (!(c instanceof AndItem)) { + return RewriteStrategies.NONE; + } + Map<Attribute[], Integer> m = new TreeMap<>(comparator); + for (Iterator<Item> i = c.getItemIterator(); i.hasNext();) { + Item j = i.next(); + if (j instanceof BlockItem || j instanceof PhraseItem) { + Attribute[] attributes= getIndices((HasIndexItem) j, session); + if (attributes == null) { + continue; + } + Integer count = m.get(attributes); + if (count == null) { + count = 1; + } else { + count = count.intValue() + 1; + } + m.put(attributes, count); + } + } + + if (m.size() == 0) { + return RewriteStrategies.NONE; + } + + int singles = 0; + int pairs = 0; + int higher = 0; + // count the number of sets being associated with 1, 2 or more terms + for (Integer i : m.values()) { + switch (i.intValue()) { + case 1: + ++singles; + break; + case 2: + pairs += 2; + break; + default: + ++higher; + break; + } + } + if (higher == 0 && pairs + singles <= 2) { + return RewriteStrategies.EXPENSIVE_AND; + } else { + return RewriteStrategies.CHEAP_AND; + } + } + + private CompositeItem rewriteNot(NotItem not, IndexFacts.Session session) { + Item positive = not.getItem(0); + if (positive instanceof BlockItem || positive instanceof PhraseItem) { + positive = convertSinglePhraseOrBlock(positive, session); + not.setItem(0, positive); + } else if (positive instanceof CompositeItem) { + CompositeItem c = (CompositeItem) positive; + positive = rewrite(c, session); + not.setItem(0, positive); + } + + int length = not.getItemCount(); + // no need for keeping proximity in the negative branches, so we + // convert them one by one, _and_ always uses cheap transform + for (int i = 1; i < length; ++i) { + Item exclusion = not.getItem(i); + if (exclusion instanceof BlockItem || exclusion instanceof PhraseItem) { + exclusion = convertSinglePhraseOrBlock(exclusion, session); + not.setItem(i, exclusion); + } else if (exclusion instanceof CompositeItem) { + CompositeItem c = (CompositeItem) exclusion; + switch (chooseRewriteStrategy(c, session)) { + case NONE: + c = traverse(c, session); + break; + case CHEAP_AND: + case EXPENSIVE_AND: + c = cheapTransform(c, session); + break; + default: + c = flatTransform(c, session); + break; + } + not.setItem(i, c); + } + } + return not; + } + + private Item rewrite(CompositeItem c, IndexFacts.Session session) { + if (c instanceof NotItem) { + c = rewriteNot((NotItem) c, session); + return c; + } else if (c instanceof CompositeItem) { + switch (chooseRewriteStrategy(c, session)) { + case NONE: + c = traverse(c, session); + break; + case CHEAP_AND: + c = cheapTransform(c, session); + break; + case EXPENSIVE_AND: + c = expensiveTransform((AndItem) c, session); + break; + case FLAT: + c = flatTransform(c, session); + default: + break; + } + } + return c; + } + + private CompositeItem traverse(CompositeItem c, IndexFacts.Session session) { + int length = c.getItemCount(); + for (int i = 0; i < length; ++i) { + Item word = c.getItem(i); + if (word instanceof CompositeItem && !(word instanceof PhraseItem) + && !(word instanceof BlockItem)) { + c.setItem(i, rewrite((CompositeItem) word, session)); + } + } + return c; + } + + private CompositeItem expensiveTransform(AndItem c, IndexFacts.Session session) { + int[] indices = new int[2]; + int items = 0; + int length = c.getItemCount(); + Attribute[][] names = new Attribute[2][]; + CompositeItem result = null; + for (int i = 0; i < length; ++i) { + Item word = c.getItem(i); + if (word instanceof BlockItem || word instanceof PhraseItem) { + Attribute[] attributes = getIndices((HasIndexItem) word, session); + if (attributes == null) { + continue; + } + // this throwing an out of bounds if more than two candidates is intentional + names[items] = attributes; + indices[items++] = i; + } else if (word instanceof CompositeItem) { + c.setItem(i, rewrite((CompositeItem) word, session)); + } + } + switch (items) { + case 1: + result = linearAnd(c, names[0], indices[0]); + break; + case 2: + result = quadraticAnd(c, names[0], names[1], indices[0], indices[1]); + break; + default: + // should never happen + getLogger().log( + LogLevel.WARNING, + "Unexpected number of items for mixed recall, got " + items + + ", expected 1 or 2."); + break; + } + return result; + } + + private Attribute[] getIndices(HasIndexItem block, IndexFacts.Session session) { + return session.getIndex(block.getIndexName()).getMatchGroup(); + } + + private OrItem linearAnd(AndItem c, Attribute[] names, int brancherIndex) { + OrItem or = new OrItem(); + for (int i = 0; i < names.length; ++i) { + AndItem duck = (AndItem) c.clone(); + Item b = retarget(duck.getItem(brancherIndex), names[i]); + duck.setItem(brancherIndex, b); + or.addItem(duck); + } + return or; + } + + private OrItem quadraticAnd(AndItem c, Attribute[] firstNames, Attribute[] secondNames, int firstBrancher, int secondBrancher) { + OrItem or = new OrItem(); + for (int i = 0; i < firstNames.length; ++i) { + for (int j = 0; j < secondNames.length; ++j) { + AndItem duck = (AndItem) c.clone(); + Item b = retarget(duck.getItem(firstBrancher), firstNames[i]); + duck.setItem(firstBrancher, b); + b = retarget(duck.getItem(secondBrancher), secondNames[j]); + duck.setItem(secondBrancher, b); + or.addItem(duck); + } + } + return or; + } + + private CompositeItem flatTransform(CompositeItem c, IndexFacts.Session session) { + int maxIndex = c.getItemCount() - 1; + for (int i = maxIndex; i >= 0; --i) { + Item word = c.getItem(i); + if (word instanceof BlockItem || word instanceof PhraseItem) { + Attribute[] attributes = getIndices((HasIndexItem) word, session); + if (attributes == null) { + continue; + } + c.removeItem(i); + for (Attribute name : attributes) { + Item term = word.clone(); + Item forNewIndex = retarget(term, name); + c.addItem(forNewIndex); + } + } else if (word instanceof CompositeItem) { + c.setItem(i, rewrite((CompositeItem) word, session)); + } + } + return c; + } + + private CompositeItem cheapTransform(CompositeItem c, IndexFacts.Session session) { + if (c instanceof OrItem) { + return flatTransform(c, session); + } + int length = c.getItemCount(); + for (int i = 0; i < length; ++i) { + Item j = c.getItem(i); + if (j instanceof BlockItem || j instanceof PhraseItem) { + Attribute[] attributes = getIndices((HasIndexItem) j, session); + if (attributes == null) { + continue; + } + CompositeItem or = searchAllForItem(j, attributes); + c.setItem(i, or); + } else if (j instanceof CompositeItem) { + c.setItem(i, rewrite((CompositeItem) j, session)); + } + } + return c; + } + + private OrItem searchAllForItem(Item word, Attribute[] attributes) { + OrItem or = new OrItem(); + for (Attribute name : attributes) { + Item term = word.clone(); + term = retarget(term, name); + or.addItem(term); + } + return or; + } + + private Item retarget(Item word, Attribute newIndex) { + if (word instanceof PhraseItem && !newIndex.isTokenizedContent()) { + PhraseItem asPhrase = (PhraseItem) word; + WordItem newWord = new WordItem(asPhrase.getIndexedString(), newIndex.name, false); + return newWord; + } else if (word instanceof IndexedItem) { + word.setIndexName(newIndex.name); + } else if (word instanceof CompositeItem) { + CompositeItem asComposite = (CompositeItem) word; + for (Iterator<Item> i = asComposite.getItemIterator(); i.hasNext();) { + Item segment = i.next(); + segment.setIndexName(newIndex.name); + } + } + return word; + } + + private Item convertSinglePhraseOrBlock(Item item, IndexFacts.Session session) { + Item newItem; + Attribute[] attributes = getIndices((HasIndexItem) item, session); + if (attributes == null) { + return item; + } + newItem = searchAllForItem(item, attributes); + return newItem; + } + +} |