// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.querytransform; import com.yahoo.fsa.FSA; import com.yahoo.prelude.query.*; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; import static com.yahoo.language.LinguisticsCase.toLowerCase; /** * Detects query phrases using an automaton. This class is thread safe. * * @author bratseth */ public class PhraseMatcher { private FSA phraseFSA = null; private boolean matchPhraseItems = false; private boolean matchSingleItems = false; /** Whether this should ignore regular plural/singular form differences when matching */ private boolean ignorePluralForm = false; /** False to matche the longest phrase, true to match all phrases */ private boolean matchAll = false; /** For null subclass only */ private PhraseMatcher() { } /** * Creates a phrase matcher. This will not ignore plural/singular form differences when matching * * @param phraseAutomatonFile the file containing phrases to match * @throws IllegalArgumentException if the file is not found */ public PhraseMatcher(String phraseAutomatonFile) { this(phraseAutomatonFile,false); } /** * Creates a phrase matcher * * @param phraseAutomatonFile the file containing phrases to match * @param ignorePluralForm whether we should ignore plural and singular forms as matches * @throws IllegalArgumentException if the file is not found */ public PhraseMatcher(String phraseAutomatonFile,boolean ignorePluralForm) { this.ignorePluralForm=ignorePluralForm; phraseFSA=new FSA(phraseAutomatonFile); } /** * Creates a phrase matcher * * @param phraseAutomatonFSA the fsa containing phrases to match * @param ignorePluralForm whether we should ignore plural and singular forms as matches * @throws IllegalArgumentException if FSA is null */ public PhraseMatcher(FSA phraseAutomatonFSA,boolean ignorePluralForm) { if(phraseAutomatonFSA==null) throw new IllegalArgumentException("FSA is null"); this.ignorePluralForm=ignorePluralForm; phraseFSA=phraseAutomatonFSA; } public boolean isEmpty() { return phraseFSA == null; } /** * Set whether to match words contained in phrase items as well. * Default is false - don't match words contained in phrase items */ public void setMatchPhraseItems(boolean matchPhraseItems) { this.matchPhraseItems=matchPhraseItems; } /** * Sets whether single items should be matched and returned as phrase matches. * Default is false. */ public void setMatchSingleItems(boolean matchSingleItems) { this.matchSingleItems=matchSingleItems; } /** Sets whether we should ignore plural/singular form when matching */ public void setIgnorePluralForm(boolean ignorePluralForm) { this.ignorePluralForm=ignorePluralForm; } /** * Sets whether to return the longest matching phrase when there are overlapping matches (default), * or all matching phrases */ public void setMatchAll(boolean matchAll) { this.matchAll =matchAll; } /** * Finds all phrases (word sequences of length 1 or higher) * of the same index, not negative items of a notitem, * which constitutes a complete entry in the automaton of this matcher * * @param queryItem the root query item in which to match phrases * @return the matched phrases, or null if there was no matches */ public List matchPhrases(Item queryItem) { if (matchSingleItems && (queryItem instanceof TermItem)) { return matchSingleItem((TermItem)queryItem); } else { MatchedPhrases phrases=new MatchedPhrases(); recursivelyMatchPhrases(queryItem,phrases); return phrases.toList(); } } /** Returns null if this word does not match the automaton, a single-item list if it does */ private List matchSingleItem(TermItem termItem) { String matchWord=toLowerCase(termItem.stringValue()); String replaceWord=null; FSA.State state = phraseFSA.getState(); if (!matches(state,matchWord)) { if (!ignorePluralForm) return null; matchWord=switchForm(matchWord); if (!matches(state,matchWord)) return null; replaceWord=matchWord; } List itemList=new java.util.ArrayList<>(1); itemList.add(new Phrase(termItem,replaceWord,state.dataString())); return itemList; } private boolean matches(FSA.State state,String word) { state.start(); state.delta(word); return state.isFinal(); } /** Find matches within a composite */ private void recursivelyMatchPhrases(Item item, MatchedPhrases phrases) { if (item == null) return; if ( ! (item instanceof CompositeItem) ) return; if ( ! matchPhraseItems && item instanceof PhraseItem ) return; CompositeItem owner=(CompositeItem)item; int i=0; int checkItemCount=owner.getItemCount(); if (owner instanceof NotItem) checkItemCount=1; // Skip negatives while (i replaceList=null; String index=null; state.start(); while (currentIndex setReplace(List replaceList,int index,String invertedWord) { if (replaceList==null) replaceList=new ArrayList<>(); while (replaceList.size()2) return word.substring(0,word.length()-1); return word + "s"; } /** Holder of a lazily created list of matched phrases */ private static class MatchedPhrases { private List phrases=null; private void add(Phrase phrase) { if (phrase==null) return; if (phrases==null) phrases=new java.util.ArrayList<>(5); phrases.add(phrase); } /** Returns the list of contained phrases, or null */ public List toList() { return phrases; } } /** * Points to a collection of word items (one or more) * which is matches a complete listing in an automat */ public static class Phrase { /** Points to the single or multiple words matched by this phrase */ private Matched matched; private String data; private Phrase(Matched matched,String data) { this.matched=matched; this.data=data; } public Phrase(TermItem item,String replace,String data) { this(new MatchedWord(item,replace),data); } /** * Creates a phrase match * * @param owner the composite we have matched within * @param replace the list of string to replace the matched by, or null to not replace. * This transfers ownership of this list to this class - it can not subsequently be accessed * by the caller. If this list is set, it must have the same length as length. * No replacement is represented by null items within the list. * @param startIndex the first index in composite to match * @param length the length of the matched terms * @param data the data accompanying this match */ private Phrase(CompositeItem owner,List replace,int startIndex,int length,String data) { this(new MatchedComposite(owner,replace,startIndex,length),data); } /** Returns the owner, or null if this is a single item phrase with no owner */ public CompositeItem getOwner() { return matched.getOwner(); } public int getStartIndex() { return matched.getStartIndex(); } public int getLength() { return matched.getLength(); } /** Returns the data stored by the automaton for this phrase at this position, or null if none */ public String getData() { return data; } /** Returns the n'th item in this, throws if index out of bounds */ public TermItem getItem(int index) { return matched.getItem(index); } /** Returns true if this phrase contains all the words of the owner, or if there is no owner */ public boolean isComplete() { return matched.isComplete(); } /** Replaces the words items of this phrase with a phrase item. Does nothing if this is not a composite match */ public void replace() { matched.replace(); } /** Removes the word items of this phrase. Does nothing nuless this is a composite */ public void remove() { matched.remove(); } /** Returns the length of the underlying phrase */ public int getBackedLength() { return matched.getBackedLength(); } /** Returns the items of this phrase as a read-only iterator */ public MatchIterator itemIterator() { return new MatchIterator(this); } public String toString() { StringBuilder buffer=new StringBuilder("\""); for (Iterator i=itemIterator(); i.hasNext(); ) { buffer.append(i.next().toString()); if (i.hasNext()) buffer.append(" "); } buffer.append("\""); return buffer.toString(); } private abstract static class Matched { public abstract CompositeItem getOwner(); public abstract int getStartIndex(); public abstract int getLength(); public abstract boolean isComplete(); /** Returns whether there is an index at the current item */ public abstract boolean hasItemAt(int index); public void replace() {} public void remove() {} public abstract TermItem getItem(int index); public abstract String getReplace(int index); /** Returns the length of the underlying item */ public abstract int getBackedLength(); public abstract boolean hasReplaces(); } private static class MatchedWord extends Matched { /** The term matched by this */ private TermItem item; /** The word to replace the matched word by, or null to not replace */ private String replace; public MatchedWord(TermItem item,String replace) { this.item=item; this.replace=replace; } public Item getItem() { return item; } public boolean hasItemAt(int index) { return index==0; } public CompositeItem getOwner() { return null; } public int getStartIndex() { return 0; } public int getLength() { return 1; } @Override public TermItem getItem(int index) { if (index!=0) throw new IndexOutOfBoundsException("No word at " + index + " in " + this); return item; } public boolean isComplete() { return true; } public int getBackedLength() { return 1; } public String getReplace(int index) { return replace; } public boolean hasReplaces() { return replace!=null; } } private static class MatchedComposite extends Matched { /** The item having the phrase words as direct descendants */ private CompositeItem owner; /** The number of phrase items */ private int length; private int initialOwnerLength; /** The (0-base) index of the first phrase word item in the owner */ private int startIndex; /** The first matched item */ private Item startItem; /** * The word to replace by at the given index, or null if none of the phrase words should be replaced * This is either null, or of length length, with null values where nothing should be replaced */ private List replace=null; public MatchedComposite(CompositeItem owner,List replace,int startIndex,int length) { this.owner=owner; this.initialOwnerLength=owner.getItemCount(); this.replace = replace; this.startIndex=startIndex; this.startItem=owner.getItem(startIndex); this.length=length; } public CompositeItem getOwner() { return owner; } public int getStartIndex() { return startIndex; } public int getLength() { return length; } public int getBackedLength() { return owner.getItemCount()-startIndex; } public boolean hasItemAt(int index) { adjustIfBackingChanged(); if (startIndex<0) return false; // Invalid state because of backing changes if ( index >= length ) return false; if ( index+startIndex >= owner.getItemCount() ) return false; return true; } public boolean isComplete() { return startIndex==0 && length==owner.getItemCount(); } @Override public TermItem getItem(int index) { adjustIfBackingChanged(); return (TermItem)owner.getItem(startIndex+index); } public String getReplace(int index) { if (replace==null) return null; return replace.get(index); } public void replace() { PhraseItem phrase=new PhraseItem(); TermItem firstWord=(TermItem)owner.setItem(startIndex,phrase); replace(firstWord,0); phrase.setIndexName(firstWord.getIndexName()); phrase.addItem(firstWord); for (int i=1; i=startIndex; i--) owner.removeItem(i); } public boolean hasReplaces() { return replace!=null; } /** * Detects and attemts to compensate for a changed backing. Stop-gap measure until we get a through * design for this */ private void adjustIfBackingChanged() { if (owner.getItemCount()==initialOwnerLength) return; startIndex=owner.getItemIndex(startItem); } } public static class MatchIterator implements Iterator { private Phrase phrase; private int currentIndex=0; public MatchIterator(Phrase phrase) { this.phrase=phrase; } public boolean hasNext() { return phrase.matched.hasItemAt(currentIndex); //return (currentIndex matchPhrases(Item item) { return null; } }; } }