aboutsummaryrefslogtreecommitdiffstats
path: root/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
blob: ae52ad83e8c854738d2a81d2ba064771969c239f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.indexinglanguage.expressions;

import com.yahoo.document.DataType;
import com.yahoo.document.annotation.*;
import com.yahoo.document.datatypes.IntegerFieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.vespa.indexinglanguage.SimpleTestAdapter;
import org.junit.Test;
import org.mockito.Mockito;

import java.util.Iterator;

import static com.yahoo.vespa.indexinglanguage.expressions.ExpressionAssert.assertVerify;
import static com.yahoo.vespa.indexinglanguage.expressions.ExpressionAssert.assertVerifyThrows;
import static org.junit.Assert.*;

/**
 * Tests {@link NGramExpression}: its accessors, equals/hashCode, verification against input types,
 * and the annotations found in the linguistics span tree after execution.
 *
 * @author bratseth
 */
public class NGramTestCase {

    /** The constructor arguments must be returned unchanged by the getters. */
    @Test
    public void requireThatAccessorsWork() {
        Linguistics linguistics = new SimpleLinguistics();
        NGramExpression exp = new NGramExpression(linguistics, 69);
        assertSame(linguistics, exp.getLinguistics());
        assertEquals(69, exp.getGramSize());
    }

    /**
     * Expressions must differ when the gram size (or both linguistics and gram size) differ,
     * be equal for the same linguistics and gram size, and produce the same hash code
     * for the same gram size even with a different SimpleLinguistics instance.
     */
    @Test
    public void requireThatHashCodeAndEqualsAreImplemented() {
        Linguistics linguistics = new SimpleLinguistics();
        NGramExpression exp = new NGramExpression(linguistics, 69);
        // equals() is invoked directly on purpose: it is the method under test.
        assertFalse(exp.equals(new Object()));
        assertFalse(exp.equals(new NGramExpression(Mockito.mock(Linguistics.class), 96)));
        assertFalse(exp.equals(new NGramExpression(linguistics, 96)));
        assertEquals(exp, new NGramExpression(linguistics, 69));
        assertEquals(exp.hashCode(), new NGramExpression(new SimpleLinguistics(), 69).hashCode());
    }

    /** Verification must accept string input and reject missing or non-string input. */
    @Test
    public void requireThatExpressionCanBeVerified() {
        Expression exp = new NGramExpression(new SimpleLinguistics(), 69);
        assertVerify(DataType.STRING, exp, DataType.STRING);
        assertVerifyThrows(null, exp, "Expected string input, but no input is specified");
        assertVerifyThrows(DataType.INT, exp, "Expected string input, got int");
    }

    /**
     * Executes a 3-gram expression over a short text and checks every span of the resulting
     * linguistics span tree, in order, including the gaps between the grams.
     */
    @Test
    public void testNGrams() {
        String text = "en gul Bille sang... ";
        ExecutionContext context = new ExecutionContext(new SimpleTestAdapter());
        context.setValue(new StringFieldValue(text));
        new NGramExpression(new SimpleLinguistics(), 3).execute(context);

        StringFieldValue value = (StringFieldValue)context.getValue();
        assertEquals("Grams are pure annotations - field value is unchanged", text, value.getString());
        SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS);
        assertNotNull(gramTree);
        SpanList grams = (SpanList)gramTree.getRoot();
        Iterator<SpanNode> i = grams.childIterator();
        assertSpan(0, 2, true, i, gramTree);  // en
        assertSpan(2, 1, false, i, gramTree); // <space>
        assertSpan(3, 3, true, i, gramTree);  // gul
        assertSpan(6, 1, false, i, gramTree); // <space>
        assertSpan(7, 3, true, i, gramTree, "bil");  // Bil - the term annotation is expected to hold "bil"
        assertSpan(8, 3, true, i, gramTree);  // ill
        assertSpan(9, 3, true, i, gramTree);  // lle
        assertSpan(12, 1, false, i, gramTree); // <space>
        assertSpan(13, 3, true, i, gramTree); // san
        assertSpan(14, 3, true, i, gramTree); // ang
        assertSpan(17, 4, false, i, gramTree); // <...space>
        assertFalse(i.hasNext());
    }

    /**
     * Executing the same expression twice on the same context must succeed, and the linguistics
     * span tree seen after the second execution must be the one seen after the first.
     */
    @Test
    public void requireThatExecuteCanBeCalledMultipleTimes() {
        ExecutionContext context = new ExecutionContext(new SimpleTestAdapter());
        context.setValue(new StringFieldValue("some random text string"));
        NGramExpression expression = new NGramExpression(new SimpleLinguistics(), 3);

        expression.execute(context);
        SpanTree firstTree = ((StringFieldValue)context.getValue()).getSpanTree(SpanTrees.LINGUISTICS);
        assertNotNull(firstTree);

        expression.execute(context);
        SpanTree secondTree = ((StringFieldValue)context.getValue()).getSpanTree(SpanTrees.LINGUISTICS);
        // The span tree instance should be the same, so identity (not mere equality) is asserted.
        assertSame("Span tree instance after second execution", firstTree, secondTree);
    }

    /**
     * Asserts on the next span like {@link #assertSpan(int, int, boolean, Iterator, SpanTree, String)},
     * expecting a gram's term annotation to carry no field value.
     */
    private void assertSpan(int from, int length, boolean gram, Iterator<SpanNode> i, SpanTree tree) {
        assertSpan(from, length, gram, i, tree, null);
    }

    /**
     * Consumes the next span from the iterator and asserts on its position and annotations.
     *
     * @param from      the expected start offset of the span
     * @param length    the expected length of the span
     * @param gram      true if the span is a gram: it must then be typed ALPHABETIC and be followed by a
     *                  TERM annotation; false if it is a gap between grams, which must be typed PUNCTUATION
     * @param i         the iterator to take the next span from; fails if it is exhausted
     * @param tree      the span tree used to look up the annotations of the span
     * @param termValue the expected string value of the TERM annotation of a gram,
     *                  or null if the annotation is expected to have no field value
     */
    private void assertSpan(int from, int length, boolean gram, Iterator<SpanNode> i, SpanTree tree, String termValue) {
        assertTrue("No more spans", i.hasNext());
        SpanNode gramSpan = i.next();
        assertEquals("gram start", from, gramSpan.getFrom());
        assertEquals("gram length", length, gramSpan.getLength());
        assertTrue("span is a leaf node", gramSpan.isLeafNode());

        Iterator<Annotation> annotations = tree.iterator(gramSpan);
        // Guard each next() so a missing annotation is reported as a test failure with context
        // rather than as a bare NoSuchElementException.
        assertTrue("Missing token type annotation on span starting at " + from, annotations.hasNext());
        Annotation typeAnnotation = annotations.next();
        assertEquals("first annotation type", AnnotationTypes.TOKEN_TYPE, typeAnnotation.getType());
        int typeInt = ((IntegerFieldValue)typeAnnotation.getFieldValue()).getInteger();

        if (!gram) { // gap between grams
            assertEquals("token type of gap", TokenType.PUNCTUATION.getValue(), typeInt);
            return;
        }

        assertEquals("token type of gram", TokenType.ALPHABETIC.getValue(), typeInt);
        assertTrue("Missing term annotation on gram starting at " + from, annotations.hasNext());
        Annotation termAnnotation = annotations.next();
        assertEquals("second annotation type", AnnotationTypes.TERM, termAnnotation.getType());
        if (termValue == null) {
            assertNull(termAnnotation.getFieldValue());
        } else {
            assertEquals(termValue, ((StringFieldValue)termAnnotation.getFieldValue()).getString());
        }
    }
}