1 files changed, 285 insertions, 0 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/querytransform/NGramSearcher.java b/container-search/src/main/java/com/yahoo/search/querytransform/NGramSearcher.java
new file mode 100644
index 00000000000..c487182c65d
--- /dev/null
+++ b/container-search/src/main/java/com/yahoo/search/querytransform/NGramSearcher.java
@@ -0,0 +1,285 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.search.querytransform;
+
+import com.yahoo.component.chain.dependencies.After;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.CharacterClasses;
+import com.yahoo.language.process.GramSplitter;
+import com.yahoo.prelude.Index;
+import com.yahoo.prelude.IndexFacts;
+import com.yahoo.prelude.hitfield.AnnotateStringFieldPart;
+import com.yahoo.prelude.hitfield.JSONString;
+import com.yahoo.prelude.hitfield.XMLString;
+import com.yahoo.prelude.query.*;
+import com.yahoo.search.Query;
+import com.yahoo.search.Result;
+import com.yahoo.search.Searcher;
+import com.yahoo.search.result.Hit;
+import com.yahoo.search.searchchain.Execution;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import static com.yahoo.prelude.searcher.JuniperSearcher.JUNIPER_TAG_REPLACING;
+import static com.yahoo.language.LinguisticsCase.toLowerCase;
+
+/**
+ * Handles NGram indexes by splitting query terms to them into grams and combining summary field values
+ * from such fields into the original text.
+ * <p>
+ * This declares it must be placed after Juniper searchers because it assumes Juniper token separators
+ * (which are returned on bolding) are not replaced by highlight tags when this is run (and "after" means
+ * "before" from the point of view of the result).
+ *
+ * @author bratseth
+ */
+@After(JUNIPER_TAG_REPLACING)
+public class NGramSearcher extends Searcher {
+
+    private final GramSplitter gramSplitter;
+
+    private final CharacterClasses characterClasses;
+
+    public NGramSearcher(Linguistics linguistics) {
+        gramSplitter= linguistics.getGramSplitter();
+        characterClasses= linguistics.getCharacterClasses();
+    }
+
+    @Override
+    public Result search(Query query, Execution execution) {
+        IndexFacts indexFacts = execution.context().getIndexFacts();
+        if ( ! indexFacts.hasNGramIndices()) return execution.search(query); // shortcut
+
+        IndexFacts.Session session = indexFacts.newSession(query);
+        boolean rewritten = rewriteToNGramMatching(query.getModel().getQueryTree().getRoot(), 0, session, query);
+        if (rewritten)
+            query.trace("Rewritten to n-gram matching",true,2);
+
+        Result result=execution.search(query);
+        recombineNGrams(result.hits().deepIterator(), session);
+        return result;
+    }
+
+    @Override
+    public void fill(Result result, String summaryClass, Execution execution) {
+        execution.fill(result, summaryClass);
+        IndexFacts indexFacts = execution.context().getIndexFacts();
+        if (indexFacts.hasNGramIndices())
+            recombineNGrams(result.hits().deepIterator(), indexFacts.newSession(result.getQuery()));
+    }
+
+    private boolean rewriteToNGramMatching(Item item, int indexInParent, IndexFacts.Session indexFacts, Query query) {
+        boolean rewritten = false;
+        if (item instanceof SegmentItem) { // handle CJK segmented terms which should be grams instead
+            SegmentItem segments = (SegmentItem)item;
+            Index index = indexFacts.getIndex(segments.getIndexName());
+            if (index.isNGram()) {
+                Item grams = splitToGrams(segments, toLowerCase(segments.getRawWord()), index.getGramSize(), query);
+                replaceItemByGrams(item, grams, indexInParent);
+                rewritten = true;
+            }
+        }
+        else if (item instanceof CompositeItem) {
+            CompositeItem composite = (CompositeItem)item;
+            for (int i=0; i<composite.getItemCount(); i++)
+                rewritten = rewriteToNGramMatching(composite.getItem(i), i, indexFacts, query) || rewritten;
+        }
+        else if (item instanceof TermItem) {
+            TermItem term = (TermItem)item;
+            Index index = indexFacts.getIndex(term.getIndexName());
+            if (index.isNGram()) {
+                Item grams = splitToGrams(term,term.stringValue(), index.getGramSize(), query);
+                replaceItemByGrams(item, grams, indexInParent);
+                rewritten = true;
+            }
+        }
+        return rewritten;
+    }
+
+    /**
+     * Splits the given item into n-grams and adds them as a CompositeItem containing WordItems searching the
+     * index of the input term. If the result is a single gram, that single WordItem is returned rather than the AndItem
+     *
+     * @param term the term to split, must be an item which implement the IndexedItem and BlockItem "mixins"
+     * @param text the text of the item, just stringValue() if the item is a TermItem
+     * @param gramSize the gram size to split to
+     * @param query the query in which this rewriting is done
+     * @return the root of the query subtree produced by this, containing the split items
+     */
+    protected Item splitToGrams(Item term, String text, int gramSize, Query query) {
+        CompositeItem and = createGramRoot(query);
+        String index = ((HasIndexItem)term).getIndexName();
+        Substring origin = ((BlockItem)term).getOrigin();
+        for (Iterator<GramSplitter.Gram> i = getGramSplitter().split(text,gramSize); i.hasNext(); ) {
+            GramSplitter.Gram gram = i.next();
+            WordItem gramWord = new WordItem(gram.extractFrom(text), index, false, origin);
+            gramWord.setWeight(term.getWeight());
+            gramWord.setProtected(true);
+            and.addItem(gramWord);
+        }
+        return and.getItemCount()==1 ? and.getItem(0) : and; // return the AndItem, or just the single gram if not multiple
+    }
+
+    /**
+     * Returns the (thread-safe) object to use to split the query text into grams.
+     */
+    protected final GramSplitter getGramSplitter() { return gramSplitter; }
+
+    /**
+     * Creates the root of the query subtree which will contain the grams to match,
+     * called by {@link #splitToGrams}. This hook is provided to make it easy to create a subclass which
+     * matches grams using a different composite item, e.g an OrItem.
+     * <p>
+     * This default implementation return new AndItem();
+     *
+     * @param query the input query, to make it possible to return a different composite item type
+     *        depending on the query content
+     * @return the composite item to add the gram items to in {@link #splitToGrams}
+     */
+    protected CompositeItem createGramRoot(Query query) {
+        return new AndItem();
+    }
+
+    private void replaceItemByGrams(Item item, Item grams, int indexInParent) {
+        if (!(grams instanceof CompositeItem) || !(item.getParent() instanceof PhraseItem)) { // usually, simply replace
+            item.getParent().setItem(indexInParent, grams);
+        }
+        else { // but if the parent is a phrase, we cannot add the AND to it, so add each gram to the phrase
+            PhraseItem phraseParent = (PhraseItem)item.getParent();
+            phraseParent.removeItem(indexInParent);
+            int addedTerms = 0;
+            for (Iterator<Item> i = ((CompositeItem)grams).getItemIterator(); i.hasNext(); ) {
+                phraseParent.addItem(indexInParent+(addedTerms++),i.next());
+            }
+        }
+    }
+
+    private void recombineNGrams(Iterator<Hit> hits, IndexFacts.Session session) {
+        while (hits.hasNext()) {
+            Hit hit = hits.next();
+            if (hit.isMeta()) continue;
+            Object sddocname = hit.getField(Hit.SDDOCNAME_FIELD);
+            if (sddocname == null) return;
+            for (String fieldName : hit.fieldKeys()) {
+                Index index = session.getIndex(fieldName, sddocname.toString());
+                if (index.isNGram() && (index.getHighlightSummary() || index.getDynamicSummary())) {
+                    hit.setField(fieldName, recombineNGramsField(hit.getField(fieldName), index.getGramSize()));
+                }
+            }
+        }
+    }
+
+    private Object recombineNGramsField(Object fieldValue,int gramSize) {
+        String recombined=recombineNGrams(fieldValue.toString(),gramSize);
+        if (fieldValue instanceof JSONString)
+            return new JSONString(recombined);
+        else if (fieldValue instanceof XMLString)
+            return new XMLString(recombined);
+        else
+            return recombined;
+    }
+
+    /**
+     * Converts grams to the original string.
+     *
+     * Example (gram size 3): <code>blulue rededs</code> becomes <code>blue reds</code>
+     */
+    private String recombineNGrams(final String string,final int gramSize) {
+        StringBuilder b=new StringBuilder();
+        int consecutiveWordChars=0;
+        boolean inBolding=false;
+        MatchTokenStrippingCharacterIterator characters=new MatchTokenStrippingCharacterIterator(string);
+        while (characters.hasNext()) {
+            char c=characters.next();
+            boolean atBoldingSeparator = (c=='\u001f');
+
+            if (atBoldingSeparator && characters.peek()=='\u001f') {
+                characters.next();
+            }
+            else if ( ! characterClasses.isLetterOrDigit(c)) {
+                if (atBoldingSeparator)
+                    inBolding=!inBolding;
+                if ( ! (atBoldingSeparator && nextIsLetterOrDigit(characters)))
+                    consecutiveWordChars=0;
+                if (inBolding && atBoldingSeparator && areWordCharactersBackwards(gramSize-1,b)) {
+                    // we are going to skip characters from a gram, so move bolding start earlier
+                    b.insert(b.length()-(gramSize-1),c);
+                }
+                else {
+                    b.append(c);
+                }
+            }
+            else {
+                consecutiveWordChars++;
+                if (consecutiveWordChars<gramSize || (consecutiveWordChars % gramSize)==0)
+                    b.append(c);
+            }
+        }
+        return b.toString();
+    }
+
+    private boolean areWordCharactersBackwards(int count,StringBuilder b) {
+        for (int i=0; i<count; i++) {
+            int checkIndex=b.length()-1-i;
+            if (checkIndex<0) return false;
+            if ( ! characterClasses.isLetterOrDigit(b.charAt(checkIndex))) return false;
+        }
+        return true;
+    }
+
+    private boolean nextIsLetterOrDigit(MatchTokenStrippingCharacterIterator characters) {
+        return characterClasses.isLetterOrDigit(characters.peek());
+    }
+
+    /**
+     * A string wrapper which skips match token forms marked up Juniper style, such that
+     * \uFFF9originalToken\uFFFAtoken\uFFFB is returned as originalToken
+     */
+    private static class MatchTokenStrippingCharacterIterator {
+
+        private final String s;
+        private int current =0;
+
+        public MatchTokenStrippingCharacterIterator(String s) {
+            this.s=s;
+        }
+
+        public boolean hasNext() {
+            skipMarkup();
+            return current <s.length();
+        }
+
+        public char next() {
+            skipMarkup();
+            return s.charAt(current++);
+        }
+
+        /** Returns the next character without moving to it. Returns \uFFFF if there is no next */
+        public char peek() {
+            skipMarkup();
+            if (s.length()< current +1)
+                return '\uFFFF';
+            else
+                return s.charAt(current);
+        }
+
+        private void skipMarkup() {
+            if (current>=s.length()) return;
+            char c=s.charAt(current);
+            if (c== AnnotateStringFieldPart.RAW_ANNOTATE_BEGIN_CHAR) { // skip it
+                current++;
+            }
+            else if (c==AnnotateStringFieldPart.RAW_ANNOTATE_SEPARATOR_CHAR) { // skip to RAW_ANNOTATE_END_CHAR
+                do {
+                    current++;
+                } while (current<s.length() && s.charAt(current)!=AnnotateStringFieldPart.RAW_ANNOTATE_END_CHAR);
+                current++; // also skip the RAW_ANNOTATE_END_CHAR
+                skipMarkup(); // skip any immediately following markup
+            }
+        }
+
+    }
+
+}