// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.querytransform;
import com.yahoo.fsa.FSA;
import com.yahoo.prelude.query.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import static com.yahoo.language.LinguisticsCase.toLowerCase;
/**
* Detects query phrases using an automaton. This class is thread safe.
*
* @author bratseth
*/
public class PhraseMatcher {
private FSA phraseFSA = null;
private boolean matchPhraseItems = false;
private boolean matchSingleItems = false;
/** Whether this should ignore regular plural/singular form differences when matching */
private boolean ignorePluralForm = false;
/** False to matche the longest phrase, true to match all phrases */
private boolean matchAll = false;
/** For null subclass only */
private PhraseMatcher() {
}
/**
* Creates a phrase matcher. This will not ignore plural/singular form differences when matching
*
* @param phraseAutomatonFile the file containing phrases to match
* @throws IllegalArgumentException if the file is not found
*/
public PhraseMatcher(String phraseAutomatonFile) {
this(phraseAutomatonFile,false);
}
/**
* Creates a phrase matcher
*
* @param phraseAutomatonFile the file containing phrases to match
* @param ignorePluralForm whether we should ignore plural and singular forms as matches
* @throws IllegalArgumentException if the file is not found
*/
public PhraseMatcher(String phraseAutomatonFile,boolean ignorePluralForm) {
this.ignorePluralForm=ignorePluralForm;
phraseFSA=new FSA(phraseAutomatonFile);
}
/**
* Creates a phrase matcher
*
* @param phraseAutomatonFSA the fsa containing phrases to match
* @param ignorePluralForm whether we should ignore plural and singular forms as matches
* @throws IllegalArgumentException if FSA is null
*/
public PhraseMatcher(FSA phraseAutomatonFSA,boolean ignorePluralForm) {
if(phraseAutomatonFSA==null) throw new IllegalArgumentException("FSA is null");
this.ignorePluralForm=ignorePluralForm;
phraseFSA=phraseAutomatonFSA;
}
public boolean isEmpty() { return phraseFSA == null; }
/**
* Set whether to match words contained in phrase items as well.
* Default is false - don't match words contained in phrase items
*/
public void setMatchPhraseItems(boolean matchPhraseItems) {
this.matchPhraseItems=matchPhraseItems;
}
/**
* Sets whether single items should be matched and returned as phrase matches.
* Default is false.
*/
public void setMatchSingleItems(boolean matchSingleItems) {
this.matchSingleItems=matchSingleItems;
}
/** Sets whether we should ignore plural/singular form when matching */
public void setIgnorePluralForm(boolean ignorePluralForm) { this.ignorePluralForm=ignorePluralForm; }
/**
* Sets whether to return the longest matching phrase when there are overlapping matches (default),
* or all matching phrases
*/
public void setMatchAll(boolean matchAll) { this.matchAll =matchAll; }
/**
* Finds all phrases (word sequences of length 1 or higher)
* of the same index, not negative items of a notitem,
* which constitutes a complete entry in the automaton of this matcher
*
* @param queryItem the root query item in which to match phrases
* @return the matched phrases, or null if there was no matches
*/
public List matchPhrases(Item queryItem) {
if (matchSingleItems && (queryItem instanceof TermItem)) {
return matchSingleItem((TermItem)queryItem);
}
else {
MatchedPhrases phrases=new MatchedPhrases();
recursivelyMatchPhrases(queryItem,phrases);
return phrases.toList();
}
}
/** Returns null if this word does not match the automaton, a single-item list if it does */
private List matchSingleItem(TermItem termItem) {
String matchWord=toLowerCase(termItem.stringValue());
String replaceWord=null;
FSA.State state = phraseFSA.getState();
if (!matches(state,matchWord)) {
if (!ignorePluralForm) return null;
matchWord=switchForm(matchWord);
if (!matches(state,matchWord)) return null;
replaceWord=matchWord;
}
List itemList=new java.util.ArrayList<>(1);
itemList.add(new Phrase(termItem,replaceWord,state.dataString()));
return itemList;
}
private boolean matches(FSA.State state,String word) {
state.start();
state.delta(word);
return state.isFinal();
}
/** Find matches within a composite */
private void recursivelyMatchPhrases(Item item, MatchedPhrases phrases) {
if (item == null) return;
if ( ! (item instanceof CompositeItem) ) return;
if ( ! matchPhraseItems && item instanceof PhraseItem ) return;
CompositeItem owner=(CompositeItem)item;
int i=0;
int checkItemCount=owner.getItemCount();
if (owner instanceof NotItem)
checkItemCount=1; // Skip negatives
while (i replaceList=null;
String index=null;
state.start();
while (currentIndex setReplace(List replaceList,int index,String invertedWord) {
if (replaceList==null)
replaceList=new ArrayList<>();
while (replaceList.size()2)
return word.substring(0,word.length()-1);
return word + "s";
}
/** Holder of a lazily created list of matched phrases */
private static class MatchedPhrases {
private List phrases=null;
private void add(Phrase phrase) {
if (phrase==null) return;
if (phrases==null)
phrases=new java.util.ArrayList<>(5);
phrases.add(phrase);
}
/** Returns the list of contained phrases, or null */
public List toList() { return phrases; }
}
/**
* Points to a collection of word items (one or more)
* which is matches a complete listing in an automat
*/
public static class Phrase {
/** Points to the single or multiple words matched by this phrase */
private Matched matched;
private String data;
private Phrase(Matched matched,String data) {
this.matched=matched;
this.data=data;
}
public Phrase(TermItem item,String replace,String data) {
this(new MatchedWord(item,replace),data);
}
/**
* Creates a phrase match
*
* @param owner the composite we have matched within
* @param replace the list of string to replace the matched by, or null to not replace.
* This transfers ownership of this list to this class - it can not subsequently be accessed
* by the caller. If this list is set, it must have the same length as length
.
* No replacement is represented by null items within the list.
* @param startIndex the first index in composite to match
* @param length the length of the matched terms
* @param data the data accompanying this match
*/
private Phrase(CompositeItem owner,List replace,int startIndex,int length,String data) {
this(new MatchedComposite(owner,replace,startIndex,length),data);
}
/** Returns the owner, or null if this is a single item phrase with no owner */
public CompositeItem getOwner() { return matched.getOwner(); }
public int getStartIndex() { return matched.getStartIndex(); }
public int getLength() { return matched.getLength(); }
/** Returns the data stored by the automaton for this phrase at this position, or null if none */
public String getData() { return data; }
/** Returns the n'th item in this, throws if index out of bounds */
public TermItem getItem(int index) {
return matched.getItem(index);
}
/** Returns true if this phrase contains all the words of the owner, or if there is no owner */
public boolean isComplete() {
return matched.isComplete();
}
/** Replaces the words items of this phrase with a phrase item. Does nothing if this is not a composite match */
public void replace() {
matched.replace();
}
/** Removes the word items of this phrase. Does nothing nuless this is a composite */
public void remove() {
matched.remove();
}
/** Returns the length of the underlying phrase */
public int getBackedLength() {
return matched.getBackedLength();
}
/** Returns the items of this phrase as a read-only iterator */
public MatchIterator itemIterator() {
return new MatchIterator(this);
}
public String toString() {
StringBuilder buffer=new StringBuilder("\"");
for (Iterator- i=itemIterator(); i.hasNext(); ) {
buffer.append(i.next().toString());
if (i.hasNext())
buffer.append(" ");
}
buffer.append("\"");
return buffer.toString();
}
private abstract static class Matched {
public abstract CompositeItem getOwner();
public abstract int getStartIndex();
public abstract int getLength();
public abstract boolean isComplete();
/** Returns whether there is an index at the current item */
public abstract boolean hasItemAt(int index);
public void replace() {}
public void remove() {}
public abstract TermItem getItem(int index);
public abstract String getReplace(int index);
/** Returns the length of the underlying item */
public abstract int getBackedLength();
public abstract boolean hasReplaces();
}
private static class MatchedWord extends Matched {
/** The term matched by this */
private TermItem item;
/** The word to replace the matched word by, or null to not replace */
private String replace;
public MatchedWord(TermItem item,String replace) {
this.item=item;
this.replace=replace;
}
public Item getItem() { return item; }
public boolean hasItemAt(int index) {
return index==0;
}
public CompositeItem getOwner() { return null; }
public int getStartIndex() { return 0; }
public int getLength() { return 1; }
@Override
public TermItem getItem(int index) {
if (index!=0) throw new IndexOutOfBoundsException("No word at " + index + " in " + this);
return item;
}
public boolean isComplete() { return true; }
public int getBackedLength() { return 1; }
public String getReplace(int index) { return replace; }
public boolean hasReplaces() { return replace!=null; }
}
private static class MatchedComposite extends Matched {
/** The item having the phrase words as direct descendants */
private CompositeItem owner;
/** The number of phrase items */
private int length;
private int initialOwnerLength;
/** The (0-base) index of the first phrase word item in the owner */
private int startIndex;
/** The first matched item */
private Item startItem;
/**
* The word to replace by at the given index, or null if none of the phrase words should be replaced
* This is either null, or of length
length
, with null values where nothing should be replaced
*/
private List replace=null;
public MatchedComposite(CompositeItem owner,List replace,int startIndex,int length) {
this.owner=owner;
this.initialOwnerLength=owner.getItemCount();
this.replace = replace;
this.startIndex=startIndex;
this.startItem=owner.getItem(startIndex);
this.length=length;
}
public CompositeItem getOwner() { return owner; }
public int getStartIndex() { return startIndex; }
public int getLength() { return length; }
public int getBackedLength() { return owner.getItemCount()-startIndex; }
public boolean hasItemAt(int index) {
adjustIfBackingChanged();
if (startIndex<0) return false; // Invalid state because of backing changes
if ( index >= length ) return false;
if ( index+startIndex >= owner.getItemCount() ) return false;
return true;
}
public boolean isComplete() {
return startIndex==0 && length==owner.getItemCount();
}
@Override
public TermItem getItem(int index) {
adjustIfBackingChanged();
return (TermItem)owner.getItem(startIndex+index);
}
public String getReplace(int index) {
if (replace==null) return null;
return replace.get(index);
}
public void replace() {
PhraseItem phrase=new PhraseItem();
TermItem firstWord=(TermItem)owner.setItem(startIndex,phrase);
replace(firstWord,0);
phrase.setIndexName(firstWord.getIndexName());
phrase.addItem(firstWord);
for (int i=1; i=startIndex; i--)
owner.removeItem(i);
}
public boolean hasReplaces() { return replace!=null; }
/**
* Detects and attemts to compensate for a changed backing. Stop-gap measure until we get a through
* design for this
*/
private void adjustIfBackingChanged() {
if (owner.getItemCount()==initialOwnerLength) return;
startIndex=owner.getItemIndex(startItem);
}
}
public static class MatchIterator implements Iterator- {
private Phrase phrase;
private int currentIndex=0;
public MatchIterator(Phrase phrase) {
this.phrase=phrase;
}
public boolean hasNext() {
return phrase.matched.hasItemAt(currentIndex);
//return (currentIndex matchPhrases(Item item) {
return null;
}
};
}
}