// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.querytransform; import com.yahoo.component.chain.dependencies.After; import com.yahoo.language.Linguistics; import com.yahoo.language.process.CharacterClasses; import com.yahoo.language.process.GramSplitter; import com.yahoo.prelude.Index; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.hitfield.AnnotateStringFieldPart; import com.yahoo.prelude.hitfield.JSONString; import com.yahoo.prelude.hitfield.XMLString; import com.yahoo.prelude.query.AndItem; import com.yahoo.prelude.query.BlockItem; import com.yahoo.prelude.query.CompositeItem; import com.yahoo.prelude.query.HasIndexItem; import com.yahoo.prelude.query.Item; import com.yahoo.prelude.query.PhraseItem; import com.yahoo.prelude.query.SegmentItem; import com.yahoo.prelude.query.Substring; import com.yahoo.prelude.query.TermItem; import com.yahoo.prelude.query.WordItem; import com.yahoo.search.Query; import com.yahoo.search.Result; import com.yahoo.search.Searcher; import com.yahoo.search.result.Hit; import com.yahoo.search.searchchain.Execution; import java.util.Iterator; import static com.yahoo.prelude.searcher.JuniperSearcher.JUNIPER_TAG_REPLACING; import static com.yahoo.language.LinguisticsCase.toLowerCase; /** * Handles NGram indexes by splitting query terms to them into grams and combining summary field values * from such fields into the original text. *
* This declares it must be placed after Juniper searchers because it assumes Juniper token separators
* (which are returned on bolding) are not replaced by highlight tags when this is run (and "after" means
* "before" from the point of view of the result).
*
* @author bratseth
*/
@After(JUNIPER_TAG_REPLACING)
public class NGramSearcher extends Searcher {
/** The gram splitter supplied by the linguistics component given at construction. */
private final GramSplitter gramSplitter;

/** Used to decide which characters count as letters or digits when recombining grams. */
private final CharacterClasses characterClasses;

/**
 * Creates an n-gram searcher which takes its gram splitter and character classes
 * from the given linguistics component.
 *
 * @param linguistics the linguistics component to obtain the gram splitter and character classes from
 */
public NGramSearcher(Linguistics linguistics) {
    this.gramSplitter = linguistics.getGramSplitter();
    this.characterClasses = linguistics.getCharacterClasses();
}
/**
 * Rewrites terms searching n-gram indexes into grams before passing the query on,
 * and recombines grams in the hits of the result which comes back.
 * When no n-gram indexes exist the query is passed on untouched.
 *
 * @param query the query to (possibly) rewrite
 * @param execution the execution to pass the query on to
 * @return the result of passing the query on, with n-gram content in hits recombined
 */
@Override
public Result search(Query query, Execution execution) {
    IndexFacts indexFacts = execution.context().getIndexFacts();
    if (indexFacts.hasNGramIndices()) {
        IndexFacts.Session session = indexFacts.newSession(query);
        boolean queryChanged =
                rewriteToNGramMatching(query.getModel().getQueryTree().getRoot(), 0, session, query);
        if (queryChanged) {
            query.trace("Rewritten to n-gram matching", true, 2);
        }

        Result result = execution.search(query);
        recombineNGrams(result.hits().deepIterator(), session);
        return result;
    }
    return execution.search(query); // nothing is n-gram indexed, so there is nothing for us to do
}
/**
 * Fills the result by passing it on, then recombines grams in the hits
 * if there are any n-gram indexes.
 *
 * @param result the result to fill
 * @param summaryClass the summary class to fill with
 * @param execution the execution to pass the fill request on to
 */
@Override
public void fill(Result result, String summaryClass, Execution execution) {
    execution.fill(result, summaryClass);

    IndexFacts indexFacts = execution.context().getIndexFacts();
    if ( ! indexFacts.hasNGramIndices()) return;

    // The hit iterator is obtained before the session, as in a plain argument list evaluation
    var hits = result.hits().deepIterator();
    IndexFacts.Session session = indexFacts.newSession(result.getQuery());
    recombineNGrams(hits, session);
}
/**
 * Recursively replaces items searching n-gram indexes by their gram representation.
 *
 * @param item the item to inspect, and whose children are inspected if it is a (non-segment) composite
 * @param indexInParent the position of the item in its parent, passed on when the item is replaced
 * @param indexFacts the index facts session used to look up the index of each item
 * @param query the query in which this rewriting is done
 * @return true if this item, or any item below it, was replaced by grams
 */
private boolean rewriteToNGramMatching(Item item, int indexInParent, IndexFacts.Session indexFacts, Query query) {
    // Segment items must be checked before composites in general: CJK segmented terms
    // should be turned into grams as a whole rather than being descended into
    if (item instanceof SegmentItem segments) {
        Index index = indexFacts.getIndex(segments.getIndexName());
        if ( ! index.isNGram()) return false;

        Item grams = splitToGrams(segments, toLowerCase(segments.getRawWord()), index.getGramSize(), query);
        replaceItemByGrams(item, grams, indexInParent);
        return true;
    }

    if (item instanceof CompositeItem composite) {
        boolean anyChildRewritten = false;
        for (int childIndex = 0; childIndex < composite.getItemCount(); childIndex++) {
            // Every child is visited, regardless of whether an earlier child was rewritten
            if (rewriteToNGramMatching(composite.getItem(childIndex), childIndex, indexFacts, query))
                anyChildRewritten = true;
        }
        return anyChildRewritten;
    }

    if (item instanceof TermItem term) {
        Index index = indexFacts.getIndex(term.getIndexName());
        if ( ! index.isNGram()) return false;

        Item grams = splitToGrams(term, term.stringValue(), index.getGramSize(), query);
        replaceItemByGrams(item, grams, indexInParent);
        return true;
    }

    return false;
}
/**
* Splits the given item into n-grams and adds them as a CompositeItem containing WordItems searching the
* index of the input term. If the result is a single gram, that single WordItem is returned rather than the AndItem
*
* @param term the term to split, must be an item which implement the IndexedItem and BlockItem "mixins"
* @param text the text of the item, just stringValue() if the item is a TermItem
* @param gramSize the gram size to split to
* @param query the query in which this rewriting is done
* @return the root of the query subtree produced by this, containing the split items
*/
protected Item splitToGrams(Item term, String text, int gramSize, Query query) {
String index = ((HasIndexItem)term).getIndexName();
CompositeItem gramsItem = createGramRoot((HasIndexItem)term, query);
gramsItem.setIndexName(index);
Substring origin = ((BlockItem)term).getOrigin();
for (Iteratorblulue rededs
becomes blue reds
*/
private String recombineNGrams(final String string,final int gramSize) {
// Rebuilds text from a string of concatenated grams by dropping the characters which overlapping
// grams repeat: within each run of letter/digit characters, the first gramSize-1 characters and
// then every gramSize'th character are kept, the rest are skipped.
// For example, with gramSize 3, "blulue" becomes "blue".
// The character U+001F is treated as a bolding separator which toggles the "in bolding" state.
StringBuilder b = new StringBuilder();
// The number of letter/digit characters seen in the current run
int consecutiveWordChars = 0;
boolean inBolding = false;
// Iterates over the characters with match token markup stripped
MatchTokenStrippingCharacterIterator characters = new MatchTokenStrippingCharacterIterator(string);
while (characters.hasNext()) {
char c = characters.next();
boolean atBoldingSeparator = (c == '\u001f');
if (atBoldingSeparator && characters.peek() == '\u001f') {
// Two separators in a row: consume the second as well and output neither,
// leaving the bolding state and the run counter unchanged
characters.next();
}
else if ( ! characterClasses.isLetterOrDigit(c)) {
if (atBoldingSeparator)
inBolding =! inBolding;
// A non-word character ends the current run, except when it is a bolding separator
// directly followed by a letter or digit, in which case the run continues across it
if ( ! (atBoldingSeparator && nextIsLetterOrDigit(characters)))
consecutiveWordChars = 0;
// Only when bolding was just switched on, and the last gramSize-1 characters already
// written are all letters/digits, is the separator placed before those characters
if (inBolding && atBoldingSeparator && areWordCharactersBackwards(gramSize - 1, b)) {
// we are going to skip characters from a gram, so move bolding start earlier
b.insert(b.length() - (gramSize-1), c);
}
else {
b.append(c);
}
}
else {
consecutiveWordChars++;
// Keep the first gramSize-1 characters of a run and then every gramSize'th character;
// the characters in between are skipped
if (consecutiveWordChars < gramSize || (consecutiveWordChars % gramSize) == 0)
b.append(c);
}
}
return b.toString();
}
/**
 * Returns whether the last {@code count} characters of the given builder are all letters or digits.
 *
 * @param count the number of trailing characters to check
 * @param b the builder whose tail is checked
 * @return true if the builder has at least {@code count} characters and the last {@code count}
 *         of them are all letters or digits, false otherwise
 */
private boolean areWordCharactersBackwards(int count, StringBuilder b) {
    int position = b.length() - 1;
    for (int remaining = count; remaining > 0; remaining--, position--) {
        // Running off the start of the builder means there are too few characters
        if (position < 0 || ! characterClasses.isLetterOrDigit(b.charAt(position)))
            return false;
    }
    return true;
}
/**
 * Returns whether the character which will be returned next by the given iterator
 * is a letter or digit. The iterator is only peeked at, the character is not consumed.
 */
private boolean nextIsLetterOrDigit(MatchTokenStrippingCharacterIterator characters) {
    char upcoming = characters.peek();
    return characterClasses.isLetterOrDigit(upcoming);
}
/**
 * A string wrapper which skips match token forms marked up Juniper style, such that
 * \uFFF9originalToken\uFFFAtoken\uFFFB is returned as originalToken.
 * <p>
 * Markup is skipped exhaustively before each operation, so any number of adjacent markup
 * sequences (such as an annotation whose original token is empty) is passed over as a whole.
 * This guarantees that {@link #hasNext()} only returns true when {@link #next()} has a
 * non-markup character to return.
 */
private static class MatchTokenStrippingCharacterIterator {

    /** The string being iterated over */
    private final String s;

    /** The index of the next character to consider in the string */
    private int current = 0;

    public MatchTokenStrippingCharacterIterator(String s) {
        this.s = s;
    }

    /** Returns whether there are more non-markup characters to return */
    public boolean hasNext() {
        skipMarkup();
        return current < s.length();
    }

    /**
     * Returns the next non-markup character and moves past it.
     *
     * @throws IndexOutOfBoundsException if there is no next character, check with {@link #hasNext()} first
     */
    public char next() {
        skipMarkup();
        return s.charAt(current++);
    }

    /** Returns the next character without moving to it. Returns \uFFFF if there is no next */
    public char peek() {
        skipMarkup();
        return current < s.length() ? s.charAt(current) : '\uFFFF';
    }

    /**
     * Moves the current position past all markup which starts at it, leaving it at
     * either a non-markup character or the end of the string.
     */
    private void skipMarkup() {
        while (current < s.length()) {
            char c = s.charAt(current);
            if (c == AnnotateStringFieldPart.RAW_ANNOTATE_BEGIN_CHAR) { // skip it
                current++;
            }
            else if (c == AnnotateStringFieldPart.RAW_ANNOTATE_SEPARATOR_CHAR) { // skip to RAW_ANNOTATE_END_CHAR
                do {
                    current++;
                } while (current < s.length() && s.charAt(current) != AnnotateStringFieldPart.RAW_ANNOTATE_END_CHAR);
                // also skip the RAW_ANNOTATE_END_CHAR, if the string was not exhausted before finding one
                if (current < s.length())
                    current++;
            }
            else {
                return; // at a regular character
            }
            // otherwise go around again to skip any immediately following markup
        }
    }

}
}