indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.indexinglanguage.expressions;

import com.yahoo.document.DataType;
import com.yahoo.document.annotation.AnnotationTypes;
import com.yahoo.document.annotation.Span;
import com.yahoo.document.annotation.SpanList;
import com.yahoo.document.annotation.SpanTree;
import com.yahoo.document.annotation.SpanTrees;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.GramSplitter;
import com.yahoo.language.process.TokenType;
import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;

import java.util.Iterator;

/**
 * A filter which splits incoming text into n-grams and records them as spans in the
 * {@code SpanTrees.LINGUISTICS} span tree of the string value.
 *
 * @author bratseth
 */
public final class NGramExpression extends Expression {

    private final Linguistics linguistics;
    private final int gramSize;

    /**
     * Creates an executable ngram expression
     *
     * @param linguistics the linguistics supplying the gram splitter to use, or null if this is used
     *                    for representation and will not be executed
     * @param gramSize the gram size
     */
    public NGramExpression(Linguistics linguistics, int gramSize) {
        super(DataType.STRING);
        this.linguistics = linguistics;
        this.gramSize = gramSize;
    }

    /** Returns the linguistics given at construction, or null if this was created for representation only */
    public Linguistics getLinguistics() {
        return linguistics;
    }

    /** Returns the gram size passed to the gram splitter */
    public int getGramSize() {
        return gramSize;
    }

    /**
     * Annotates the current string value of the context with n-grams.
     *
     * <p>If the value already has a linguistics span tree, nothing is done. Otherwise the value is cloned,
     * the clone replaces the context value, and a new linguistics span tree is attached to the clone:
     * each gram becomes a span typed {@code TokenType.ALPHABETIC} carrying the annotation produced by
     * {@code LinguisticsAnnotator.lowerCaseTermAnnotation}, and each stretch of text before a gram or
     * after the last gram that no gram has reached becomes a span typed {@code TokenType.PUNCTUATION}.
     *
     * @param context the context holding the value to annotate
     * @throws IllegalStateException if this was created without linguistics
     */
    @Override
    protected void doExecute(ExecutionContext context) {
        StringFieldValue input = (StringFieldValue) context.getValue();
        if (input.getSpanTree(SpanTrees.LINGUISTICS) != null) {
            // This expression is already executed for this input instance
            return;
        }
        if (linguistics == null) {
            // Fail before touching the context, so a representation-only instance leaves no partial state behind
            throw new IllegalStateException("Cannot execute '" + this + "': it was created without linguistics");
        }
        StringFieldValue output = input.clone();
        context.setValue(output);

        SpanList spanList = output.setSpanTree(new SpanTree(SpanTrees.LINGUISTICS)).spanList();
        // Read the text once, and use the same string for splitting, extraction and length
        String text = output.getString();
        int lastPosition = 0;
        for (Iterator<GramSplitter.Gram> it = linguistics.getGramSplitter().split(text, gramSize); it.hasNext();) {
            GramSplitter.Gram gram = it.next();
            // if there is a gap before this gram, then annotate the gap as punctuation
            // (technically it may be of various types, but it does not matter - we just
            // need to annotate it somehow (as a non-term) to make sure it is added to the summary)
            if (lastPosition < gram.getStart()) {
                typedSpan(lastPosition, gram.getStart() - lastPosition, TokenType.PUNCTUATION, spanList);
            }

            // annotate gram as a word term
            String gramString = gram.extractFrom(text);
            typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList).
                    annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString));

            // NOTE(review): the start offset and the code point count are added as if they share a unit -
            // confirm this holds for text containing supplementary (surrogate pair) characters
            lastPosition = gram.getStart() + gram.getCodePointCount();
        }
        // handle punctuation at the end
        if (lastPosition < text.length()) {
            typedSpan(lastPosition, text.length() - lastPosition, TokenType.PUNCTUATION, spanList);
        }
    }

    /**
     * Adds a span to the given span list and annotates it with a token type.
     *
     * @param from the start of the span
     * @param length the length of the span
     * @param tokenType the token type whose value is set as the span's TOKEN_TYPE annotation
     * @param spanList the list to add the span to
     * @return the added span, for further annotation
     */
    private Span typedSpan(int from, int length, TokenType tokenType, SpanList spanList) {
        return (Span)spanList.span(from, length).annotate(AnnotationTypes.TOKEN_TYPE, tokenType.getValue());
    }

    @Override
    protected void doVerify(VerificationContext context) {
        // empty
    }

    /** Returns null */
    @Override
    public DataType createdOutputType() {
        return null;
    }

    @Override
    public String toString() {
        return "ngram " + gramSize;
    }

    /**
     * Two ngram expressions are equal if they have the same gram size and their linguistics are
     * either both null or instances of the same class.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) return true;
        if (!(obj instanceof NGramExpression rhs)) return false;
        if (gramSize != rhs.gramSize) return false;

        // When either side lacks linguistics they are equal only if both do
        if (linguistics == null || rhs.linguistics == null) return linguistics == rhs.linguistics;
        return linguistics.getClass() == rhs.linguistics.getClass();
    }

    /** Depends on the class and gram size only, which keeps it consistent with {@link #equals(Object)} */
    @Override
    public int hashCode() {
        return getClass().hashCode() + gramSize;
    }

}