diff options
author | Jon Bratseth <bratseth@verizonmedia.com> | 2019-08-26 12:44:11 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@verizonmedia.com> | 2019-08-26 12:44:11 +0200 |
commit | 072a47d3e0c9352add3e5a3cf9d6d7bd22ec04ff (patch) | |
tree | 18955c2a837c4cda1ba9c23e585c99ffc9523abb /container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java | |
parent | 1baf860bedd3b2193799ddd3a4a5b9dc50cb5c6a (diff) |
Support index-command phrase-segmenting
- Set connectivity to max between items in implicit phrases (always)
- Allow indexes to choose to represent implicit phrases as AND items (instead of
phrase items) in legacy parsers by adding index-command: phrase-segmenting false
Diffstat (limited to 'container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java')
-rw-r--r-- | container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java | 42 |
1 files changed, 28 insertions, 14 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java index 8297a566a72..cd8579be7f0 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java @@ -228,13 +228,13 @@ public abstract class AbstractParser implements CustomParser { protected abstract Item parseItems(); /** - * Assigns the default index to query terms having no default index The - * parser _should_ have done this, for some reason it doesn't + * Assigns the default index to query terms having no default index. The + * parser _should_ have done this, for some reason it doesn't. * - * @param defaultIndex The default index to assign. - * @param item The item to check. + * @param defaultIndex the default index to assign + * @param item the item to check */ - private static void assignDefaultIndex(final String defaultIndex, Item item) { + private static void assignDefaultIndex(String defaultIndex, Item item) { if (defaultIndex == null || item == null) return; if (item instanceof IndexedItem) { @@ -253,9 +253,6 @@ public abstract class AbstractParser implements CustomParser { /** * Unicode normalizes some piece of natural language text. The chosen form * is compatibility decomposition, canonical composition (NFKC). - * - * @param input The string to normalize. - * @return The normalized string. */ protected String normalize(String input) { if (input == null || input.length() == 0) return input; @@ -272,8 +269,8 @@ public abstract class AbstractParser implements CustomParser { /** * Tokenizes the given string and initializes tokens with the found tokens. * - * @param query the string to tokenize. - * @param defaultIndexName the name of the index to use as default. 
+ * @param query the string to tokenize + * @param defaultIndexName the name of the index to use as default * @param indexFacts resolved information about the index we are searching * @param language the language set for this query, or null if none */ @@ -324,6 +321,13 @@ public abstract class AbstractParser implements CustomParser { } } + /** + * Segments a token + * + * @param indexName the index name which preceded this token, or null if none + * @param token the token to segment + * @return the resulting item + */ // TODO: The segmenting stuff is a mess now, this will fix it: // - Make Segmenter a class which is instantiated per parsing // - Make the instance know the language, etc and do all dispatching internally @@ -331,13 +335,13 @@ public abstract class AbstractParser implements CustomParser { // TODO: Use segmenting for forced phrase searches? // // Language detection currently depends on tokenization (see generateLanguageDetectionTextFrom), but - // - the API's was originally not constructed for that, so a careful nd somewhat unsatisfactory dance - // most be carried out to make it work + // - the API's was originally not constructed for that, so a careful and somewhat unsatisfactory dance + // must be carried out to make it work // - it should really depend on parsing // This can be solved by making the segment method language independent by // always producing a query item containing the token text and resolve it to a WordItem or // SegmentItem after parsing and language detection. 
- protected Item segment(Token token) { + protected Item segment(String indexName, Token token) { String normalizedToken = normalize(token.toString()); if (token.isSpecial()) { @@ -361,13 +365,23 @@ public abstract class AbstractParser implements CustomParser { return new WordItem(segments.get(0), "", true, token.substring); } - CompositeItem composite = new PhraseSegmentItem(token.toString(), normalizedToken, true, false, token.substring); + CompositeItem composite; + if (indexFacts.getIndex(indexName).getPhraseSegmenting()) { + composite = new PhraseSegmentItem(token.toString(), normalizedToken, true, false, token.substring); + } + else { + composite = new AndSegmentItem(token.toString(), true, false); + } int n = 0; + WordItem previous = null; for (String segment : segments) { WordItem w = new WordItem(segment, "", true, token.substring); w.setFromSegmented(true); w.setSegmentIndex(n++); w.setStemmed(false); + if (previous != null) + previous.setConnectivity(w, 1.0); + previous = w; composite.addItem(w); } composite.lock(); |