// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.querytransform; import com.yahoo.component.chain.dependencies.After; import com.yahoo.language.Linguistics; import com.yahoo.language.process.CharacterClasses; import com.yahoo.language.process.GramSplitter; import com.yahoo.prelude.Index; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.hitfield.AnnotateStringFieldPart; import com.yahoo.prelude.hitfield.JSONString; import com.yahoo.prelude.hitfield.XMLString; import com.yahoo.prelude.query.*; import com.yahoo.search.Query; import com.yahoo.search.Result; import com.yahoo.search.Searcher; import com.yahoo.search.result.Hit; import com.yahoo.search.searchchain.Execution; import java.util.Iterator; import static com.yahoo.prelude.searcher.JuniperSearcher.JUNIPER_TAG_REPLACING; import static com.yahoo.language.LinguisticsCase.toLowerCase; /** * Handles NGram indexes by splitting query terms to them into grams and combining summary field values * from such fields into the original text. *
* This declares it must be placed after Juniper searchers because it assumes Juniper token separators
* (which are returned on bolding) are not replaced by highlight tags when this is run (and "after" means
* "before" from the point of view of the result).
*
* @author bratseth
*/
@After(JUNIPER_TAG_REPLACING)
public class NGramSearcher extends Searcher {
/** Gram splitter obtained from the Linguistics instance passed to the constructor. */
private final GramSplitter gramSplitter;
/** Character classes obtained from the same Linguistics instance; used for letter-or-digit checks when recombining grams. */
private final CharacterClasses characterClasses;
/**
 * Creates an n-gram searcher which takes its gram splitter and character classes
 * from the given linguistics implementation.
 *
 * @param linguistics the linguistics component supplying the gram splitter and character classes
 */
public NGramSearcher(Linguistics linguistics) {
    this.gramSplitter = linguistics.getGramSplitter();
    this.characterClasses = linguistics.getCharacterClasses();
}
/**
 * Rewrites the query tree to n-gram matching where applicable, passes the query down the chain,
 * and then runs n-gram recombination over all hits (deep iteration) of the returned result.
 * When the index facts report no n-gram indexes the query is passed on untouched.
 *
 * @param query the query to rewrite and execute
 * @param execution the execution used to pass the query on and to look up index facts
 * @return the result returned from further down the chain, after n-gram recombination
 */
@Override
public Result search(Query query, Execution execution) {
    final IndexFacts facts = execution.context().getIndexFacts();
    if (facts.hasNGramIndices()) {
        // One session is shared by the query rewrite and the result recombination
        final IndexFacts.Session factsSession = facts.newSession(query);
        if (rewriteToNGramMatching(query.getModel().getQueryTree().getRoot(), 0, factsSession, query)) {
            query.trace("Rewritten to n-gram matching", true, 2);
        }
        final Result searched = execution.search(query);
        recombineNGrams(searched.hits().deepIterator(), factsSession);
        return searched;
    }
    // Shortcut: no n-gram indexes, so there is nothing to rewrite or recombine
    return execution.search(query);
}
/**
 * Fills the result through the rest of the chain first, and then, if the index facts report
 * n-gram indexes, runs n-gram recombination over all hits (deep iteration) of the result.
 *
 * @param result the result to fill
 * @param summaryClass the summary class to fill with
 * @param execution the execution used to pass the fill on and to look up index facts
 */
@Override
public void fill(Result result, String summaryClass, Execution execution) {
    execution.fill(result, summaryClass);
    final IndexFacts facts = execution.context().getIndexFacts();
    if ( ! facts.hasNGramIndices()) {
        return; // nothing to recombine
    }
    // A fresh session is created from the query attached to the result
    recombineNGrams(result.hits().deepIterator(), facts.newSession(result.getQuery()));
}
private boolean rewriteToNGramMatching(Item item, int indexInParent, IndexFacts.Session indexFacts, Query query) {
boolean rewritten = false;
if (item instanceof SegmentItem) { // handle CJK segmented terms which should be grams instead
SegmentItem segments = (SegmentItem)item;
Index index = indexFacts.getIndex(segments.getIndexName());
if (index.isNGram()) {
Item grams = splitToGrams(segments, toLowerCase(segments.getRawWord()), index.getGramSize(), query);
replaceItemByGrams(item, grams, indexInParent);
rewritten = true;
}
}
else if (item instanceof CompositeItem) {
CompositeItem composite = (CompositeItem)item;
for (int i=0; i
* This default implementation returns createGramRoot(query).
*
* @param term the term item this gram root is replacing in the query tree,
* typically used to access the index name of the term when that is required by the new gram root
* (such as in PhraseItem)
* @param query the input query, to make it possible to return a different composite item type
* depending on the query content
* @return the composite item to add the gram items to in {@link #splitToGrams}
*/
protected CompositeItem createGramRoot(HasIndexItem term, Query query) {
// The term is ignored here: by default this simply delegates to the query-only overload.
// Subclasses which need the term being replaced can override this method instead.
return createGramRoot(query);
}
/**
 * Creates the root of the query subtree without access to the term being replaced.
 * This default implementation returns a new {@link AndItem}.
 *
 * @param query the input query; not used by this default implementation
 * @return the composite item which will become the root of the gram subtree
 */
protected CompositeItem createGramRoot(Query query) {
return new AndItem();
}
private void replaceItemByGrams(Item item, Item grams, int indexInParent) {
if (!(grams instanceof CompositeItem) || !(item.getParent() instanceof PhraseItem)) { // usually, simply replace
item.getParent().setItem(indexInParent, grams);
}
else { // but if the parent is a phrase, we cannot add the AND to it, so add each gram to the phrase
PhraseItem phraseParent = (PhraseItem)item.getParent();
phraseParent.removeItem(indexInParent);
int addedTerms = 0;
for (Iteratorblulue rededs
becomes blue reds
*/
private String recombineNGrams(final String string,final int gramSize) {
StringBuilder b=new StringBuilder();
int consecutiveWordChars=0;
boolean inBolding=false;
MatchTokenStrippingCharacterIterator characters=new MatchTokenStrippingCharacterIterator(string);
while (characters.hasNext()) {
char c=characters.next();
boolean atBoldingSeparator = (c=='\u001f');
if (atBoldingSeparator && characters.peek()=='\u001f') {
characters.next();
}
else if ( ! characterClasses.isLetterOrDigit(c)) {
if (atBoldingSeparator)
inBolding=!inBolding;
if ( ! (atBoldingSeparator && nextIsLetterOrDigit(characters)))
consecutiveWordChars=0;
if (inBolding && atBoldingSeparator && areWordCharactersBackwards(gramSize-1,b)) {
// we are going to skip characters from a gram, so move bolding start earlier
b.insert(b.length()-(gramSize-1),c);
}
else {
b.append(c);
}
}
else {
consecutiveWordChars++;
if (consecutiveWordChars