aboutsummaryrefslogtreecommitdiffstats
path: root/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
blob: d1514267a9bdeb72d54ce7d5019f7f07043230a7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.querytransform.test;

import com.yahoo.component.chain.Chain;
import com.yahoo.config.subscription.ConfigGetter;
import com.yahoo.container.QrSearchersConfig;
import com.yahoo.language.Linguistics;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.prelude.IndexFacts;
import com.yahoo.prelude.IndexFactsFactory;
import com.yahoo.prelude.IndexModel;
import com.yahoo.prelude.query.*;
import com.yahoo.prelude.querytransform.StemmingSearcher;
import com.yahoo.search.Query;
import com.yahoo.search.Searcher;
import com.yahoo.search.config.IndexInfoConfig;
import com.yahoo.search.searchchain.Execution;

import com.yahoo.search.test.QueryTestCase;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests of how {@link StemmingSearcher} rewrites the terms of a query tree.
 *
 * <p>Most tests run a chain containing only the stemming searcher and compare the string form
 * of the resulting query tree root with an expected value.
 *
 * @author Mathias M. Lidal
 */
public class StemmingSearcherTestCase {

    private static final Linguistics linguistics = new SimpleLinguistics();

    /** Index facts created from the config id pointing at this test's source directory. */
    private final IndexFacts indexFacts = IndexFactsFactory.newInstance("dir:src/test/java/com/yahoo/prelude/" +
                                                                        "querytransform/test/", null);

    /**
     * Expects the terms without an index prefix to be normalized ("Holes" becomes "hole",
     * "CVS" becomes "cvs") while the term in the {@code nostem} index is left as written.
     * NOTE(review): presumably {@code nostem} is declared as non-stemming in the index info
     * config of the test directory - confirm.
     */
    @Test
    void testStemOnlySomeTerms() {
        assertStemmed("WEAKAND(100) hole in cvs and subversion nostem:Found",
                      "/search?query=Holes in CVS and Subversion nostem:Found");
    }

    /**
     * Expects the words of a phrase segment to be replaced by the words derived from the
     * segment's raw string ("d e") when the query is stemmed.
     */
    @Test
    void testPhraseSegmentTransforms() {
        Query q1 = buildQueryWithSegmentPhrase();
        executeStemming(q1);
        assertEquals("AND a 'd e'", treeString(q1));
    }

    /**
     * Builds a German-language query whose tree is {@code AND a 'b c'}, where the phrase
     * segment holds the words "b" and "c" but was created from the raw string "d e".
     *
     * @return the query, with the tree already verified to print as {@code AND a 'b c'}
     */
    private Query buildQueryWithSegmentPhrase() {
        Query q1 = new Query("/search?query=placeholder&language=de");
        q1.getModel().setExecution(newExecution());
        AndItem root = new AndItem();
        root.addItem(new WordItem("a", true));
        // this is a trick, note the string to stem contains space
        PhraseSegmentItem p = new PhraseSegmentItem("d e", true, false);
        p.addItem(new WordItem("b", true));
        p.addItem(new WordItem("c", true));
        p.lock();
        root.addItem(p);
        q1.getModel().getQueryTree().setRoot(root);
        // Sanity check of the fixture before any stemming has taken place
        assertEquals("AND a 'b c'", treeString(q1));
        return q1;
    }

    /**
     * Expects a connectivity set from the first word to the phrase segment to refer to the
     * phrase segment found in the tree after stemming.
     */
    @Test
    void testPreserveConnectivityToPhrase() {
        Query q1 = buildQueryWithSegmentPhrase();
        CompositeItem r = (CompositeItem) q1.getModel().getQueryTree().getRoot();
        WordItem first = (WordItem) r.getItem(0);
        PhraseSegmentItem second = (PhraseSegmentItem) r.getItem(1);
        first.setConnectivity(second, 1.0d);
        executeStemming(q1);
        assertEquals("AND a 'd e'", treeString(q1));
        // Re-read the items: the tree content has changed, so look up the current children
        r = (CompositeItem) q1.getModel().getQueryTree().getRoot();
        first = (WordItem) r.getItem(0);
        second = (PhraseSegmentItem) r.getItem(1);
        assertEquals(second, first.getConnectedItem(), "Connectivity incorrect.");
    }

    /** Expects a prefix term ({@code ist*}) to pass through stemming unchanged. */
    @Test
    void testDontStemPrefixes() {
        assertStemmed("WEAKAND(100) ist*", "/search?query=ist*&language=de");
    }

    /** Expects stemming of an empty query to leave a {@link NullItem} as the tree root. */
    @Test
    void testStemming() {
        Query query = new Query("/search?query=");
        executeStemming(query);
        assertTrue(query.getModel().getQueryTree().getRoot() instanceof NullItem);
    }

    /**
     * Expects "towers" to be reduced to "tower", and "tower" and "tow" to be kept,
     * in both the {@code noun} and the {@code notnoun} index.
     */
    @Test
    void testNounStemming() {
        assertStemmed("WEAKAND(100) noun:tower noun:tower noun:tow",
                      "/search?query=noun:towers noun:tower noun:tow");
        assertStemmed("WEAKAND(100) notnoun:tower notnoun:tower notnoun:tow",
                      "/search?query=notnoun:towers notnoun:tower notnoun:tow");
    }

    /**
     * Expects the query term to be left unstemmed ("cars") when the index facts are built
     * from the {@code emptyindexinfo.cfg} configuration.
     */
    @SuppressWarnings("deprecation")
    @Test
    void testEmptyIndexInfo() {
        String indexInfoConfigID = "file:src/test/java/com/yahoo/prelude/querytransform/test/emptyindexinfo.cfg";
        ConfigGetter<IndexInfoConfig> getter = new ConfigGetter<>(IndexInfoConfig.class);
        IndexInfoConfig config = getter.getConfig(indexInfoConfigID);

        // Named differently from the instance field so it is clear which index facts are in use here
        IndexFacts emptyIndexFacts = new IndexFacts(new IndexModel(config, (QrSearchersConfig) null));

        Query q = new Query(QueryTestCase.httpEncode("?query=cars"));
        new Execution(new Chain<Searcher>(new StemmingSearcher(linguistics)),
                      Execution.Context.createContextStub(emptyIndexFacts, linguistics)).search(q);
        assertEquals("WEAKAND(100) cars", treeString(q));
    }

    /**
     * Expects an unstemmed word to be turned into a set of word alternatives in which the
     * original form ("trees") is present with exactness 1.0.
     * NOTE(review): presumably {@code search=three} selects a setup with multiple-form
     * stemming in the test's index info config - confirm.
     */
    @Test
    void testLiteralBoost() {
        Query q = new Query(QueryTestCase.httpEncode("/search?language=en&search=three"));
        WordItem scratch = new WordItem("trees", true);
        scratch.setStemmed(false);
        q.getModel().getQueryTree().setRoot(scratch);
        executeStemming(q);
        assertTrue(q.getModel().getQueryTree().getRoot() instanceof WordAlternativesItem,
                   "Expected a set of word alternatives as root.");
        WordAlternativesItem w = (WordAlternativesItem) q.getModel().getQueryTree().getRoot();
        boolean foundExpectedBaseForm = false;
        for (WordAlternativesItem.Alternative a : w.getAlternatives()) {
            if ("trees".equals(a.word)) {
                assertEquals(1.0d, a.exactness, 1e-15);
                foundExpectedBaseForm = true;
            }
        }
        assertTrue(foundExpectedBaseForm, "Did not find original word form in query.");
    }

    /**
     * Expects single words to become word alternatives (stem at 0.7, original at 1.0) while
     * the words inside quoted phrases are replaced by a single stemmed form.
     */
    @Test
    void testMultipleStemming() {
        assertStemmed("WEAKAND(100) WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] " +
                      "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] " +
                      "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]",
                      "/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar");
    }

    /**
     * Expects emoji to survive stemming as terms of their own, also when they are written
     * adjacent to each other or to a regular word.
     */
    @Test
    void testEmojiStemming() {
        String emoji1 = "\uD83C\uDF49"; // 🍉
        String emoji2 = "\uD83D\uDE00"; // 😀
        assertStemmed("WEAKAND(100) " + emoji1, "/search?query=" + emoji1);
        assertStemmed("WEAKAND(100) (AND " + emoji1 + " " + emoji2 + ")", "/search?query=" + emoji1 + emoji2);
        assertStemmed("WEAKAND(100) (AND " + emoji1 + " foo " + emoji2 + ")", "/search?query=" + emoji1 + "foo" + emoji2);
    }

    /** Creates a stub execution context using this test's index facts and linguistics. */
    private Execution.Context newExecutionContext() {
        return Execution.Context.createContextStub(indexFacts, linguistics);
    }

    /** Creates an execution without a searcher chain, using a fresh stub context. */
    private Execution newExecution() {
        return new Execution(newExecutionContext());
    }

    /**
     * Runs the given query through a chain consisting only of a {@link StemmingSearcher}.
     *
     * @param query the query to search with; the tests inspect its query tree afterwards
     */
    private void executeStemming(Query query) {
        new Execution(new Chain<Searcher>(new StemmingSearcher(linguistics)),
                      newExecutionContext()).search(query);
    }

    /**
     * Stems the query given by a (not yet http-encoded) request string and asserts on the
     * string form of the resulting query tree.
     *
     * @param expectedQueryTree the expected string form of the query tree root after stemming
     * @param queryString the request string to build the query from
     */
    private void assertStemmed(String expectedQueryTree, String queryString) {
        Query query = new Query(QueryTestCase.httpEncode(queryString));
        executeStemming(query);
        assertEquals(expectedQueryTree, treeString(query));
    }

    /** Returns the string form of the root of the given query's query tree. */
    private static String treeString(Query query) {
        return query.getModel().getQueryTree().getRoot().toString();
    }

}