diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-01-20 15:12:15 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-01-20 15:12:15 +0100 |
commit | 09caf52b327f6a48af8acf02872a49e08d75c9c9 (patch) | |
tree | 7e3b4422ebe5a6ec71e2b2fbaeabb3cb4306f226 /container-search/src/main/java | |
parent | 262d072c1ac996b34f6c70efc95853be699ca935 (diff) |
Detect language after tokenization
This is a prerequisite for being smarter about which subset of the input text is used for language detection.
However, it breaks functionality in one subtle way: If an application does not pass the language explicitly (such that
it must be detected), the input is CJK, and special tokens are configured, those special tokens will
not be detected when they are surrounded by word characters (instead of, e.g., spaces), because tokenization now runs before the language is known and substring matching of special tokens is only enabled for a known CJK language.
Diffstat (limited to 'container-search/src/main/java')
11 files changed, 44 insertions, 50 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java b/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java index 3631dedeffc..3f931c92489 100644 --- a/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java +++ b/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java @@ -18,11 +18,11 @@ import static com.yahoo.text.Lowercase.toLowerCase; * session.getIndex(indexName).[get index info] * </code></pre> * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ // TODO: We should replace this with a better representation of search definitions // which is immutable, models clusters and search definitions inside clusters properly, -// and uses better names. +// and uses better names. -bratseth public class IndexFacts { private Map<String, List<String>> clusterByDocument; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java index 0dd8e1c36cc..38e2b82ea35 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java @@ -10,7 +10,7 @@ import com.yahoo.prelude.query.textualrepresentation.Discloser; * extend AndItem to avoid code using instanceof handling it as an * AndItem. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public abstract class SegmentItem extends CompositeItem implements BlockItem { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java index 5051108ea9b..da73aab3396 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java @@ -134,14 +134,15 @@ public abstract class AbstractParser implements CustomParser { @Override public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage, IndexFacts.Session indexFacts, String defaultIndexName) { - if (queryToParse == null) { - return null; - } + if (queryToParse == null) return null; + + tokenize(queryToParse, defaultIndexName, indexFacts, parsingLanguage); + if (parsingLanguage == null) { parsingLanguage = environment.getLinguistics().getDetector().detect(queryToParse, null).getLanguage(); } setState(parsingLanguage, indexFacts); - tokenize(queryToParse, defaultIndexName, indexFacts); + Item root = parseItems(); if (filterToParse != null) { AnyParser filterParser = new AnyParser(environment); @@ -167,25 +168,19 @@ public abstract class AbstractParser implements CustomParser { * @param defaultIndex The default index to assign. * @param item The item to check. 
*/ - private static void assignDefaultIndex(final String defaultIndex, - final Item item) { - if (defaultIndex == null || item == null) { - return; - } + private static void assignDefaultIndex(final String defaultIndex, Item item) { + if (defaultIndex == null || item == null) return; if (item instanceof IndexedItem) { - final IndexedItem indexName = (IndexedItem) item; + IndexedItem indexName = (IndexedItem) item; - if ("".equals(indexName.getIndexName())) { + if ("".equals(indexName.getIndexName())) indexName.setIndexName(defaultIndex); - } - } else if (item instanceof CompositeItem) { - final Iterator<Item> items = ((CompositeItem) item) - .getItemIterator(); - while (items.hasNext()) { - final Item i = items.next(); - assignDefaultIndex(defaultIndex, i); - } + } + else if (item instanceof CompositeItem) { + Iterator<Item> items = ((CompositeItem)item).getItemIterator(); + while (items.hasNext()) + assignDefaultIndex(defaultIndex, items.next()); } } @@ -215,10 +210,11 @@ public abstract class AbstractParser implements CustomParser { * @param query the string to tokenize. * @param defaultIndexName the name of the index to use as default. 
* @param indexFacts resolved information about the index we are searching + * @param language the language set for this query, or null if none */ - protected void tokenize(String query, String defaultIndexName, IndexFacts.Session indexFacts) { + protected void tokenize(String query, String defaultIndexName, IndexFacts.Session indexFacts, Language language) { Tokenizer tokenizer = new Tokenizer(environment.getLinguistics()); - tokenizer.setSubstringSpecialTokens(language.isCjk()); + tokenizer.setSubstringSpecialTokens(language != null && language.isCjk()); tokenizer.setSpecialTokens(environment.getSpecialTokens()); tokens.initialize(tokenizer.tokenize(query, defaultIndexName, indexFacts)); } @@ -265,9 +261,8 @@ public abstract class AbstractParser implements CustomParser { // TODO: The segmenting stuff is a mess now, this will fix it: // - Make Segmenter a class which is instantiated per parsing - // - Make the instance know the language, etc and do all dispatching - // internally - // -JSB + // - Make the instance know the language, etc and do all dispatching internally + // -bratseth // TODO: Use segmenting for forced phrase searches? 
protected Item segment(Token token) { String normalizedToken = normalize(token.toString()); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java index e0089fb89ea..95cce001469 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java @@ -35,7 +35,7 @@ public class AnyParser extends SimpleParser { Item filterRoot; setState(queryLanguage, indexFacts); - tokenize(filter, null, indexFacts); + tokenize(filter, null, indexFacts, queryLanguage); filterRoot = anyItems(true); @@ -134,7 +134,7 @@ public class AnyParser extends SimpleParser { Item applyFilter(Item root, String filter, Language queryLanguage, IndexFacts.Session indexFacts) { setState(queryLanguage, indexFacts); - tokenize(filter, null, indexFacts); + tokenize(filter, null, indexFacts, queryLanguage); return filterItems(root); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java index ba10b7b6ee1..dfd05ca0da5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java @@ -8,7 +8,7 @@ import com.yahoo.search.query.parser.ParserEnvironment; /** * Parser for queries of type phrase. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public class PhraseParser extends AbstractParser { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java index 6117e8e29ed..d1df74fcfa5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java @@ -13,7 +13,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.SPACE; * Base class for parsers of the "simple" query languages (query types * ANY and ALL). * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ abstract class SimpleParser extends StructuredParser { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java index eb35655e4ca..fd1617b5350 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java @@ -15,7 +15,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.*; * Base class for parsers of the query languages which can be used * for structured queries (types ANY, ALL and ADVANCED). 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ abstract class StructuredParser extends AbstractParser { @@ -430,9 +430,7 @@ abstract class StructuredParser extends AbstractParser { Item item = null; try { - if (item == null) { - item = word(); - } + item = word(); if (item == null && tokens.currentIs(NUMBER)) { Token t = tokens.next(); @@ -542,7 +540,7 @@ abstract class StructuredParser extends AbstractParser { PhraseItem phrase = null; Item firstWord = null; boolean starAfterFirst = false; - boolean starBeforeFirst = false; + boolean starBeforeFirst; if (tokens.skipMultiple(QUOTE)) { quoted = !quoted; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java index 582395bc738..ed0af8d5060 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java @@ -16,7 +16,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.*; /** * Query tokenizer. Singlethreaded. * - * @author bratseth + * @author bratseth */ public final class Tokenizer { @@ -28,7 +28,7 @@ public final class Tokenizer { private SpecialTokens specialTokens = null; /** Whether to recognize tokens also as substrings of other tokens, needed for cjk */ - private boolean substringSpecialTokens=false; + private boolean substringSpecialTokens = false; private final CharacterClasses characterClasses; @@ -53,7 +53,7 @@ public final class Tokenizer { /** Sets whether to recognize tokens also as substrings of other tokens, needed for cjk. Default false. 
*/ public void setSubstringSpecialTokens(boolean substringSpecialTokens) { - this.substringSpecialTokens=substringSpecialTokens; + this.substringSpecialTokens = substringSpecialTokens; } /** diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java index 009c11ab1fd..b04ac2fcec5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java @@ -26,21 +26,21 @@ import com.yahoo.search.searchchain.Execution; import com.yahoo.search.searchchain.PhaseNames; /** - * Search to do necessary transforms if the query is in segmented in - * a "CJK language". + * Search to do necessary transforms if the query is in segmented in a CJK language. * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ @After(PhaseNames.UNBLENDED_RESULT) @Before(STEMMING) @Provides(CJKSearcher.TERM_ORDER_RELAXATION) public class CJKSearcher extends Searcher { + public static final String TERM_ORDER_RELAXATION = "TermOrderRelaxation"; @Override public Result search(Query query, Execution execution) { - Language l = query.getModel().getParsingLanguage(); - if (!l.isCjk()) return execution.search(query); + Language language = query.getModel().getParsingLanguage(); + if ( ! 
language.isCjk()) return execution.search(query); QueryTree tree = query.getModel().getQueryTree(); tree.setRoot(transform(tree.getRoot())); @@ -82,7 +82,6 @@ public class CJKSearcher extends Searcher { return root; } - private boolean hasOverlappingTokens(PhraseItem phrase) { boolean has = false; for (Iterator<Item> i = phrase.getItemIterator(); i.hasNext(); ) { @@ -108,4 +107,5 @@ public class CJKSearcher extends Searcher { } return segmentsLength > segments.getRawWord().length(); } + } diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index ca8214f35d6..a4562892d0c 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -78,11 +78,11 @@ public class StemmingSearcher extends Searcher { public String getFunctionName() { return "Stemming"; } private Item replaceTerms(Query q, IndexFacts.Session indexFacts) { - Language l = q.getModel().getParsingLanguage(); - if (l == Language.UNKNOWN) { + Language language = q.getModel().getParsingLanguage(); + if (language == Language.UNKNOWN) { return q.getModel().getQueryTree().getRoot(); } - return scan(q.getModel().getQueryTree().getRoot(), l.isCjk(), l, indexFacts, + return scan(q.getModel().getQueryTree().getRoot(), language.isCjk(), language, indexFacts, createReverseConnectivities(q.getModel().getQueryTree().getRoot())); } diff --git a/container-search/src/main/java/com/yahoo/search/query/Model.java b/container-search/src/main/java/com/yahoo/search/query/Model.java index c155ed4fbbd..ca6f7efaa5e 100644 --- a/container-search/src/main/java/com/yahoo/search/query/Model.java +++ b/container-search/src/main/java/com/yahoo/search/query/Model.java @@ -74,7 +74,7 @@ public class Model implements Cloneable { private String filter = null; private Language 
language = null; private Locale locale = null; - private QueryTree queryTree = null; // The actual query. This is lazily created from the program + private QueryTree queryTree = null; // The query tree to execute. This is lazily created from the program private String defaultIndex = null; private Query.Type type = Query.Type.ALL; private Query parent; @@ -431,7 +431,8 @@ public class Model implements Cloneable { return (Model)q.properties().get(argumentTypeName); } - public @Override String toString() { + @Override + public String toString() { return "query representation [queryTree: " + queryTree + ", filter: " + filter + "]"; } |