blob: b04ac2fcec54352412a28a7f7e26fc0e00ca6eb8 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.querytransform;
import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING;
import java.util.Iterator;
import java.util.ListIterator;
import com.yahoo.language.Language;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.component.chain.dependencies.After;
import com.yahoo.component.chain.dependencies.Before;
import com.yahoo.component.chain.dependencies.Provides;
import com.yahoo.prelude.query.AndItem;
import com.yahoo.prelude.query.AndSegmentItem;
import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.PhraseItem;
import com.yahoo.prelude.query.PhraseSegmentItem;
import com.yahoo.prelude.query.SegmentItem;
import com.yahoo.prelude.query.WordItem;
import com.yahoo.search.Searcher;
import com.yahoo.search.query.QueryTree;
import com.yahoo.search.searchchain.Execution;
import com.yahoo.search.searchchain.PhaseNames;
/**
 * Searcher which rewrites implicit phrases when the query is parsed as a CJK language:
 * a phrase which is not explicit is replaced by an {@link AndItem}, and a phrase segment
 * which is not explicit is replaced by an {@link AndSegmentItem}. Explicit phrases, and
 * phrases whose segments contain overlapping tokens, are left as they are.
 *
 * @author Steinar Knutsen
 */
@After(PhaseNames.UNBLENDED_RESULT)
@Before(STEMMING)
@Provides(CJKSearcher.TERM_ORDER_RELAXATION)
public class CJKSearcher extends Searcher {

    public static final String TERM_ORDER_RELAXATION = "TermOrderRelaxation";

    /**
     * Rewrites the query tree if the parsing language of the query is CJK, then passes
     * the query on down the chain. Queries in other languages are passed on untouched.
     *
     * @param query the incoming query; its query tree root may be replaced
     * @param execution the execution used to invoke the rest of the search chain
     * @return the result produced by the rest of the chain
     */
    @Override
    public Result search(Query query, Execution execution) {
        if (query.getModel().getParsingLanguage().isCjk()) {
            QueryTree queryTree = query.getModel().getQueryTree();
            Item rewrittenRoot = transform(queryTree.getRoot());
            queryTree.setRoot(rewrittenRoot);
            query.trace("Rewriting for CJK behavior for implicit phrases", true, 2);
        }
        return execution.search(query);
    }

    /**
     * Rewrites the given item, recursing into ordinary composites.
     *
     * @param root the item to rewrite
     * @return the item to use in place of {@code root}; this is {@code root} itself
     *         unless it was an implicit phrase or implicit phrase segment
     */
    private Item transform(Item root) {
        if (root instanceof PhraseItem) return relaxPhrase((PhraseItem) root);
        if (root instanceof PhraseSegmentItem) return relaxSegment((PhraseSegmentItem) root);
        if (root instanceof SegmentItem) return root; // avoid descending into AndSegmentItems and similar
        if (root instanceof CompositeItem) transformChildren((CompositeItem) root);
        return root;
    }

    /** Returns an AndItem holding the content of the given phrase, or the phrase itself if it must be kept. */
    private Item relaxPhrase(PhraseItem phrase) {
        if (phrase.isExplicit() || hasOverlappingTokens(phrase)) return phrase;

        AndItem conjunction = new AndItem();
        ListIterator<Item> children = phrase.getItemIterator();
        while (children.hasNext()) {
            Item child = children.next();
            if (child instanceof PhraseSegmentItem) {
                conjunction.addItem(new AndSegmentItem((PhraseSegmentItem) child));
            } else {
                // Words are carried over unchanged. Nothing else is expected here,
                // but if it shows up it is carried over unchanged as well.
                conjunction.addItem(child);
            }
        }
        return conjunction;
    }

    /** Returns an AndSegmentItem built from the given segment, or the segment itself if it must be kept. */
    private Item relaxSegment(PhraseSegmentItem segment) {
        boolean keep = segment.isExplicit() || hasOverlappingTokens(segment);
        return keep ? segment : new AndSegmentItem(segment);
    }

    /** Rewrites each child of the given composite, replacing in place those which were changed. */
    private void transformChildren(CompositeItem composite) {
        ListIterator<Item> children = composite.getItemIterator();
        while (children.hasNext()) {
            Item before = children.next();
            Item after = transform(before);
            if (after != before) {
                children.set(after);
            }
        }
    }

    /** Returns whether any phrase segment directly below the given phrase has overlapping tokens. */
    private boolean hasOverlappingTokens(PhraseItem phrase) {
        Iterator<Item> children = phrase.getItemIterator();
        while (children.hasNext()) {
            Item child = children.next();
            if (child instanceof PhraseSegmentItem && hasOverlappingTokens((PhraseSegmentItem) child)) {
                return true;
            }
        }
        return false;
    }

    /*
     * The tokens of a segment overlap if the sum of the lengths of the tokens
     * is greater than the length of the original (raw) word. See
     * com.yahoo.prelude.querytransform.test.CJKSearcherTestCase
     * .testCjkQueryWithOverlappingTokens and ParseTestCase for an explanation.
     */
    private boolean hasOverlappingTokens(PhraseSegmentItem segments) {
        int combinedTokenLength = 0;
        Iterator<Item> tokens = segments.getItemIterator();
        while (tokens.hasNext()) {
            WordItem token = (WordItem) tokens.next();
            combinedTokenLength += token.getWord().length();
        }
        return combinedTokenLength > segments.getRawWord().length();
    }

}
|