// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.query.rewrite.rewriters; import java.io.*; import java.util.*; import java.util.logging.Logger; import com.google.inject.Inject; import com.yahoo.component.chain.dependencies.Provides; import com.yahoo.fsa.FSA; import com.yahoo.search.query.rewrite.*; import com.yahoo.search.*; import com.yahoo.component.ComponentId; import com.yahoo.filedistribution.fileacquirer.FileAcquirer; import com.yahoo.search.query.rewrite.RewritesConfig; import com.yahoo.prelude.querytransform.PhraseMatcher; /** * This rewriter would add rewrites to entities (e.g abbreviation, synonym, etc)
* to boost precision * - FSA dict: [normalized original query]\t[rewrite 1]\t[rewrite 2]\t[etc]
* - Features:
* RewritesAsUnitEquiv flag: add proximity boosted rewrites
* PartialPhraseMatch flag: whether to match whole phrase or partial phrase
* MaxRewrites flag: the maximum number of rewrites to be added
* * @author Karen Sze Wing Lee */ @Provides("GenericExpansionRewriter") public class GenericExpansionRewriter extends QueryRewriteSearcher { // Flag for skipping this rewriter if the query has been rewritten private final boolean SKIP_REWRITER_IF_REWRITTEN = false; // Name of the rewriter public static final String REWRITER_NAME = "GenericExpansionRewriter"; // Generic expansion dictionary name public static final String GENERIC_EXPAND_DICT = "GenericExpansion"; // Default generic expansion dictionary file name public static final String GENERIC_EXPAND_DICT_FILENAME = "GenericExpansionRewriter.fsa"; // PhraseMatcher created from FSA dict private PhraseMatcher phraseMatcher; private Logger logger; /** * Constructor for GenericExpansionRewriter. * Load configs using default format */ @Inject public GenericExpansionRewriter(ComponentId id, FileAcquirer fileAcquirer, RewritesConfig config) { super(id, fileAcquirer, config); } /** * Constructor for GenericExpansionRewriter unit test. * Load configs using default format */ public GenericExpansionRewriter(RewritesConfig config, HashMap fileList) { super(config, fileList); } /** * Instance creation time config loading besides FSA. * Create PhraseMatcher from FSA dict */ public boolean configure(FileAcquirer fileAcquirer, RewritesConfig config, HashMap fileList) { logger = Logger.getLogger(GenericExpansionRewriter.class.getName()); FSA fsa = (FSA)rewriterDicts.get(GENERIC_EXPAND_DICT); if (fsa==null) { RewriterUtils.error(logger, "Error retrieving FSA dictionary: " + GENERIC_EXPAND_DICT); return false; } // Create Phrase Matcher RewriterUtils.log(logger, "Creating PhraseMatcher"); try { phraseMatcher = new PhraseMatcher(fsa, false); } catch (IllegalArgumentException e) { RewriterUtils.error(logger, "Error creating phrase matcher"); return false; } // Match single word as well phraseMatcher.setMatchSingleItems(true); // Return all matches instead of only the longest match phraseMatcher.setMatchAll(true); return true; } /** * Main logic of rewriter
* - Retrieve rewrites from FSA dict
* - rewrite query using features that are enabled by user */ public HashMap rewrite(Query query, String dictKey) throws RuntimeException { Boolean rewritten = false; // Pass the original dict key to the next rewriter HashMap result = new HashMap<>(); result.put(RewriterConstants.REWRITTEN, rewritten); result.put(RewriterConstants.DICT_KEY, dictKey); RewriterUtils.log(logger, query, "In GenericExpansionRewriter, query used for dict retrieval=[" + dictKey + "]"); // Retrieve flags for choosing between whole query match // or partial query match String partialPhraseMatch = getQPConfig(query, RewriterConstants.PARTIAL_PHRASE_MATCH); if(partialPhraseMatch==null) { RewriterUtils.error(logger, query, "Required param " + RewriterConstants.PARTIAL_PHRASE_MATCH + " is not set, skipping rewriter"); throw new RuntimeException("Required param " + RewriterConstants.PARTIAL_PHRASE_MATCH + " is not set, skipping rewriter"); } // Retrieve max number of rewrites allowed int maxNumRewrites = 0; String maxNumRewritesStr = getQPConfig(query, RewriterConstants.MAX_REWRITES); if(maxNumRewritesStr!=null) { maxNumRewrites = Integer.parseInt(maxNumRewritesStr); RewriterUtils.log(logger, query, "Limiting max number of rewrites to: " + maxNumRewrites); } else { RewriterUtils.log(logger, query, "No limit on number of rewrites"); } // Retrieve flags for choosing whether to add // the rewrites as phrase, default to false String rewritesAsUnitEquiv = getQPConfig(query, RewriterConstants.REWRITES_AS_UNIT_EQUIV); if(rewritesAsUnitEquiv==null) { rewritesAsUnitEquiv = "false"; } Set matches; // Partial Phrase Matching if(partialPhraseMatch.equalsIgnoreCase("true")) { RewriterUtils.log(logger, query, "Partial phrase matching"); // Retrieve longest non overlapping matches matches = RewriterFeatures.getNonOverlappingPartialPhraseMatches(phraseMatcher, query); // Full Phrase Matching if set to anything else } else { RewriterUtils.log(logger, query, "Full phrase matching"); // Retrieve longest non overlapping matches matches = RewriterFeatures.getNonOverlappingFullPhraseMatches(phraseMatcher, query); } if(matches==null) { return result; } // Add expansions to the query query = RewriterFeatures.addExpansions(query, matches, null, maxNumRewrites, false, rewritesAsUnitEquiv.equalsIgnoreCase("true")); rewritten = true; RewriterUtils.log(logger, query, "GenericExpansionRewriter final query: " + query.toDetailString()); result.put(RewriterConstants.REWRITTEN, rewritten); return result; } /** * Get the flag which specifies whether this rewriter * should be skipped if the query has been rewritten * * @return true if rewriter should be skipped, false * otherwise */ public boolean getSkipRewriterIfRewritten() { return SKIP_REWRITER_IF_REWRITTEN; } /** * Get the name of the rewriter * * @return Name of the rewriter */ public String getRewriterName() { return REWRITER_NAME; } /** * Get default FSA dictionary names * * @return Pair of FSA dictionary name and filename */ public HashMap getDefaultFSAs() { HashMap defaultDicts = new HashMap<>(); defaultDicts.put(GENERIC_EXPAND_DICT, GENERIC_EXPAND_DICT_FILENAME); return defaultDicts; } }