diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-24 11:42:26 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-24 11:42:26 +0200 |
commit | 547cee55234eb52cc8332381ee6dff219c5cd1f9 (patch) | |
tree | 06249d92218694c2e9a918277d8f53d4ca7291a4 /container-search/src/main/java/com | |
parent | c2b57fddcbc9beb3b866d579c6cd68a68590651e (diff) |
Segment to phrase, not AND, when inside an explicit phrase
Diffstat (limited to 'container-search/src/main/java/com')
5 files changed, 43 insertions, 34 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java index 542f1393852..9b34fd7d62b 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java @@ -19,16 +19,13 @@ public class PhraseSegmentItem extends IndexedSegmentItem { /** Whether this was explicitly written as a phrase using quotes by the user */ private boolean explicit = false; - /** - * Creates a phrase containing the same words and state (as pertinent) as - * the given SegmentAndItem. - */ - public PhraseSegmentItem(AndSegmentItem segAnd) { - super(segAnd.getRawWord(), segAnd.stringValue(), segAnd.isFromQuery(), segAnd.isStemmed(), segAnd.getOrigin()); - if (segAnd.getItemCount() > 0) { - WordItem w = (WordItem) segAnd.getItem(0); + /** Creates a phrase containing the same words and state (as pertinent) as the given SegmentAndItem. 
*/ + public PhraseSegmentItem(AndSegmentItem andSegment) { + super(andSegment.getRawWord(), andSegment.stringValue(), andSegment.isFromQuery(), andSegment.isStemmed(), andSegment.getOrigin()); + if (andSegment.getItemCount() > 0) { + WordItem w = (WordItem) andSegment.getItem(0); setIndexName(w.getIndexName()); - for (Iterator<Item> i = segAnd.getItemIterator(); i.hasNext();) { + for (Iterator<Item> i = andSegment.getItemIterator(); i.hasNext();) { WordItem word = (WordItem) i.next(); addWordItem(word); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java index cd8579be7f0..902be7e15dd 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java @@ -326,6 +326,7 @@ public abstract class AbstractParser implements CustomParser { * * @param indexName the index name which preceeded this token, or null if none * @param token the token to segment + * @param quoted whether this segment is within quoted text * @return the resulting item */ // TODO: The segmenting stuff is a mess now, this will fix it: @@ -341,7 +342,7 @@ public abstract class AbstractParser implements CustomParser { // This can be solved by making the segment method language independent by // always producing a query item containing the token text and resolve it to a WordItem or // SegmentItem after parsing and language detection. 
- protected Item segment(String indexName, Token token) { + protected Item segment(String indexName, Token token, boolean quoted) { String normalizedToken = normalize(token.toString()); if (token.isSpecial()) { @@ -361,12 +362,13 @@ public abstract class AbstractParser implements CustomParser { if (segments.size() == 0) { return null; } + if (segments.size() == 1) { return new WordItem(segments.get(0), "", true, token.substring); } CompositeItem composite; - if (indexFacts.getIndex(indexName).getPhraseSegmenting()) { + if (indexFacts.getIndex(indexName).getPhraseSegmenting() || quoted) { composite = new PhraseSegmentItem(token.toString(), normalizedToken, true, false, token.substring); } else { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java index 6d4401aca04..12f63276269 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java @@ -23,8 +23,7 @@ public class PhraseParser extends AbstractParser { /** * Ignores everything but words and numbers * - * @return a phrase item if several words/numbers was found, - * a word item if only one was found + * @return a phrase item if several words/numbers was found, a word item if only one was found */ private Item forcedPhrase() { Item firstWord = null; @@ -38,7 +37,7 @@ public class PhraseParser extends AbstractParser { } // Note, this depends on segment never creating AndItems when quoted // (the second argument) is true. 
- Item newWord = segment(null, token); + Item newWord = segment(null, token, true); if (firstWord == null) { // First pass firstWord = newWord; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java index 9ba6c1a8101..76ea7fb11a8 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java @@ -406,13 +406,18 @@ abstract class StructuredParser extends AbstractParser { } } - /** Words for phrases also permits numerals as words */ - private Item phraseWord(String indexName, boolean insidePhrase) { + /** + * Words for phrases also permits numerals as words + * + * @param quoted whether we are consuming text within quoted + * @param insidePhrase whether we are consuming additional items for an existing phrase + */ + private Item phraseWord(String indexName, boolean quoted, boolean insidePhrase) { int position = tokens.getPosition(); Item item = null; try { - item = word(indexName); + item = word(indexName, quoted); if (item == null && tokens.currentIs(NUMBER)) { Token t = tokens.next(); @@ -434,10 +439,12 @@ abstract class StructuredParser extends AbstractParser { /** * Returns a WordItem if this is a non CJK query, - * a WordItem or PhraseSegmentItem if this is a CJK query, + * a WordItem or SegmentItem if this is a CJK query, * null if the current item is not a word + * + * @param quoted whether this token is inside quotes */ - private Item word(String indexName) { + private Item word(String indexName, boolean quoted) { int position = tokens.getPosition(); Item item = null; @@ -452,7 +459,7 @@ abstract class StructuredParser extends AbstractParser { if (submodes.url) { item = new WordItem(word, true); } else { - item = segment(indexName, word); + item = segment(indexName, word, quoted); } if (submodes.url || 
submodes.site) { @@ -539,7 +546,7 @@ abstract class StructuredParser extends AbstractParser { quoted = !quoted; } - Item word = phraseWord(indexName, (firstWord != null) || (composite != null)); + Item word = phraseWord(indexName, quoted, (firstWord != null) || (composite != null)); if (word == null) { if (tokens.skipMultiple(QUOTE)) { diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java index ae8c289a5b0..785477d6df7 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java @@ -56,26 +56,31 @@ public class CJKSearcher extends Searcher { AndItem replacement = new AndItem(); for (ListIterator<Item> i = ((CompositeItem) root).getItemIterator(); i.hasNext();) { Item item = i.next(); - if (item instanceof WordItem) replacement.addItem(item); - else if (item instanceof PhraseSegmentItem) { + if (item instanceof WordItem) + replacement.addItem(item); + else if (item instanceof PhraseSegmentItem) replacement.addItem(new AndSegmentItem((PhraseSegmentItem) item)); - } - else replacement.addItem(item); // should never run, but hey... 
just convert and hope it's OK :) + else + replacement.addItem(item); // should never get here } return replacement; - } else if (root instanceof PhraseSegmentItem) { + } + else if (root instanceof PhraseSegmentItem) { PhraseSegmentItem asSegment = (PhraseSegmentItem) root; - if (asSegment.isExplicit() || hasOverlappingTokens(asSegment)) return root; - else return new AndSegmentItem(asSegment); - } else if (root instanceof SegmentItem) { + if (asSegment.isExplicit() || hasOverlappingTokens(asSegment)) + return root; + else + return new AndSegmentItem(asSegment); + } + else if (root instanceof SegmentItem) { return root; // avoid descending into AndSegmentItems and similar - } else if (root instanceof CompositeItem) { + } + else if (root instanceof CompositeItem) { for (ListIterator<Item> i = ((CompositeItem) root).getItemIterator(); i.hasNext();) { Item item = i.next(); Item transformedItem = transform(item); - if (item != transformedItem) { + if (item != transformedItem) i.set(transformedItem); - } } return root; } @@ -96,8 +101,7 @@ public class CJKSearcher extends Searcher { * We have overlapping tokens (see * com.yahoo.prelude.querytransform.test.CJKSearcherTestCase * .testCjkQueryWithOverlappingTokens and ParseTestCase for an explanation) - * if the sum of length of tokens is greater than the lenght of the original - * word + * if the sum of length of tokens is greater than the length of the original word */ private boolean hasOverlappingTokens(PhraseSegmentItem segments) { int segmentsLength=0; |