linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;


import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * A class which splits consecutive word character sequences into overlapping character n-grams.
 * For example "en gul bille sang" split into 2-grams becomes
 * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang".
 * <p>
 * This class is multithread safe.
 *
 * @author bratseth
 */
public class GramSplitter {

    private final CharacterClasses characterClasses;

    public GramSplitter(CharacterClasses characterClasses) {
        this.characterClasses = characterClasses;
    }

    /**
     * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length]
     * pairs into the input string.
     * <p>
     * The iterator is implemented as a sliding view over the input string rather than being backed by a
     * list, which makes this space efficient for large strings.
     *
     * @param input the input string to be split, cannot be null
     * @param n     the gram size, a positive integer
     * @return a read only iterator over the resulting grams
     * @throws NullPointerException     if input==null
     * @throws IllegalArgumentException if n is less than 1
     */
    public GramSplitterIterator split(String input, int n) {
        if (input == null) throw new NullPointerException("input cannot be null");
        if (n < 1) throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
        return new GramSplitterIterator(input, n, characterClasses);
    }

    public static class GramSplitterIterator implements Iterator<Gram> {

        private final CharacterClasses characterClasses;

        /** Text to split */
        private final UnicodeString input;

        /** Gram size in code points */
        private final int n;

        /** Current position in the string */
        private int i = 0;

        /** Whether the last thing that happened was being on a separator (including the start of the string) */
        private boolean isFirstAfterSeparator = true;

        /** The next gram or null if not determined yet */
        private Gram nextGram = null;

        public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
            this.input = new UnicodeString(input);
            this.n = n;
            this.characterClasses = characterClasses;
        }

        @Override
        public boolean hasNext() {
            if (nextGram != null) return true;
            nextGram = findNext();
            return nextGram != null;
        }

        @Override
        public Gram next() {
            Gram currentGram = nextGram;
            if (currentGram == null)
                currentGram = findNext();
            if (currentGram == null)
                throw new NoSuchElementException("No next gram at position " + i);
            nextGram = null;
            return currentGram;
        }

        private Gram findNext() {
            // Skip to next indexable character
            while (i < input.length() && !isIndexable(input.codePointAt(i))) {
                i = input.next(i);
                isFirstAfterSeparator = true;
            }
            if (i >= input.length()) return null; // no indexable characters

            int tokenStart = i;
            UnicodeString gram = input.substring(tokenStart, n);
            int tokenEnd = tokenEnd(gram);
            gram = new UnicodeString(gram.toString().substring(0, tokenEnd));
            if (gram.codePointCount() == n) { // normal case: got a full length gram
                Gram g = new Gram(i, gram.codePointCount());
                i = input.next(i);
                isFirstAfterSeparator = false;
                return g;
            }
            else { // gram is too short due either to being a symbol, being followed by a non-word separator, or end of string
                if (isFirstAfterSeparator || ( gram.codePointCount() == 1 && characterClasses.isSymbol(gram.codePointAt(0)))) { // make a gram anyway
                    Gram g = new Gram(i, gram.codePointCount());
                    i = input.next(i);
                    isFirstAfterSeparator = false;
                    return g;
                } else { // skip to next
                    i = input.skip(gram.codePointCount(), i);
                    isFirstAfterSeparator = true;
                    return findNext();
                }
            }
        }

        private boolean isIndexable(int codepoint) {
            if (characterClasses.isLetterOrDigit(codepoint)) return true;
            if (characterClasses.isSymbol(codepoint)) return true;
            return false;
        }

        /** Given a string s starting by an indexable character, return the position where that token should end. */
        private int tokenEnd(UnicodeString s) {
            if (characterClasses.isSymbol(s.codePointAt(0)))
                return s.next(0); // symbols have length 1

            int i = 0;
            for (; i < s.length(); i = s.next(i)) {
                if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i)))
                    return i;
            }
            return i;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("This iterator is read only");
        }

        /**
         * Convenience list which splits the remaining items in this iterator into a list of gram strings
         *
         * @return an immutable list of extracted grams
         */
        public List<String> toExtractedList() {
            List<String> gramList = new ArrayList<>();
            while (hasNext())
                gramList.add(next().extractFrom(input));
            return Collections.unmodifiableList(gramList);
        }
    }

    /**
     * An immutable start index and length pair
     */
    public static final class Gram {

        private final int start, codePointCount;

        public Gram(int start, int codePointCount) {
            this.start = start;
            this.codePointCount = codePointCount;
        }

        public int getStart() {
            return start;
        }

        public int getCodePointCount() {
            return codePointCount;
        }

        /** Returns this gram as a string from the input string */
        public String extractFrom(String input) {
            return extractFrom(new UnicodeString(input));
        }

        /** Returns this gram as a string from the input string */
        public String extractFrom(UnicodeString input) {
            return input.substring(start, codePointCount).toString();
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if ( ! (o instanceof Gram)) return false;

            Gram gram = (Gram)o;
            if (codePointCount != gram.codePointCount) return false;
            if (start != gram.start) return false;
            return true;
        }

        @Override
        public int hashCode() {
            int result = start;
            result = 31 * result + codePointCount;
            return result;
        }

    }

    /**
     * A string wrapper with some convenience methods for dealing with UTF-16 surrogate pairs
     * (a crime against humanity for which we'll be negatively impacted for at least the next million years).
     */
    private static class UnicodeString {

        private final String s;

        public UnicodeString(String s) {
            this.s = s;
        }

        /** Substring in code point space */
        public UnicodeString substring(int start, int codePoints) {
            int cps = codePoints * 2 <= s.length() - start ? codePoints
                                                           : Math.min(codePoints, s.codePointCount(start, s.length()));
            return new UnicodeString(s.substring(start, s.offsetByCodePoints(start, cps)));
        }

        /** Returns the position count code points after start (which may be past the end of the string) */
        public int skip(int codePointCount, int start) {
            int index = start;
            for (int i = 0; i < codePointCount; i++) {
                index = next(index);
                if (index > s.length()) break;
            }
            return index;
        }

        /** Returns the index of the next code point after start (which may be past the end of the string) */
        public int next(int index) {
            int next = index + 1;
            if (next < s.length() && Character.isLowSurrogate(s.charAt(next)))
                next++;
            return next;
        }

        /** Returns the number of positions (not code points) in this */
        public int length() { return s.length(); }

        /** Returns the number of code points in this */
        public int codePointCount() { return s.codePointCount(0, s.length()); }

        public int codePointAt(int index) {
            return s.codePointAt(index);
        }

        @Override
        public String toString() { return s; }

    }

}