linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366

// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/*
 * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
 * under the terms of the Apache License, Version 2.0.
 */
package com.yahoo.language.simple.kstem;


import java.io.IOException;
import java.io.Reader;

/**
 * {@link CharacterUtils} provides a unified interface to Character-related
 * operations to implement backwards compatible character operations.
 */
public abstract class CharacterUtils {

  private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
  private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();

  /**
   * Returns a {@link CharacterUtils} implementation.
   */
  public static CharacterUtils getInstance() {
    return JAVA_5;
  }

  /**
   * Returns the code point at the given index of the {@link CharSequence}.
   * 
   * @param seq
   *          a character sequence
   * @param offset
   *          the offset to the char values in the chars array to be converted
   * 
   * @return the Unicode code point at the given index
   * @throws NullPointerException
   *           - if the sequence is null.
   * @throws IndexOutOfBoundsException
   *           - if the value offset is negative or not less than the length of
   *           the character sequence.
   */
  public abstract int codePointAt(final CharSequence seq, final int offset);
  
  /**
   * Returns the code point at the given index of the char array where only elements
   * with index less than the limit are used.
   * 
   * @param chars
   *          a character array
   * @param offset
   *          the offset to the char values in the chars array to be converted
   * @param limit the index afer the last element that should be used to calculate
   *        codepoint.  
   * 
   * @return the Unicode code point at the given index
   * @throws NullPointerException
   *           - if the array is null.
   * @throws IndexOutOfBoundsException
   *           - if the value offset is negative or not less than the length of
   *           the char array.
   */
  public abstract int codePointAt(final char[] chars, final int offset, final int limit);

  /** Return the number of characters in <code>seq</code>. */
  public abstract int codePointCount(CharSequence seq);

  /**
   * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
   * of the given bufferSize.
   * 
   * @param bufferSize
   *          the internal char buffer size, must be <code>&gt;= 2</code>
   * @return a new {@link CharacterBuffer} instance.
   */
  public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
    if (bufferSize < 2) {
      throw new IllegalArgumentException("buffersize must be >= 2");
    }
    return new CharacterBuffer(new char[bufferSize], 0, 0);
  }
  
  
  /**
   * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting 
   * at the given offset.
   * @param buffer the char buffer to lowercase
   * @param offset the offset to start at
   * @param limit the max char in the buffer to lower case
   */
  public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert offset <=0 && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      i += Character.toChars(
              Character.toLowerCase(
                  codePointAt(buffer, i, limit)), buffer, i);
     }
  }

  /**
   * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting 
   * at the given offset.
   * @param buffer the char buffer to UPPERCASE
   * @param offset the offset to start at
   * @param limit the max char in the buffer to lower case
   */
  public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert offset <=0 && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      i += Character.toChars(
              Character.toUpperCase(
                  codePointAt(buffer, i, limit)), buffer, i);
     }
  }

  /** Converts a sequence of Java characters to a sequence of unicode code points.
   *  @return the number of code points written to the destination buffer */
  public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int codePointCount = 0;
    for (int i = 0; i < srcLen; ) {
      final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
      final int charCount = Character.charCount(cp);
      dest[destOff + codePointCount++] = cp;
      i += charCount;
    }
    return codePointCount;
  }

  /** Converts a sequence of unicode code points to a sequence of Java characters.
   *  @return the number of chars written to the destination buffer */
  public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int written = 0;
    for (int i = 0; i < srcLen; ++i) {
      written += Character.toChars(src[srcOff + i], dest, destOff + written);
    }
    return written;
  }

  /**
   * Fills the {@link CharacterBuffer} with characters read from the given
   * reader {@link Reader}. This method tries to read <code>numChars</code>
   * characters into the {@link CharacterBuffer}, each call to fill will start
   * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
   * In case code points can span across 2 java characters, this method may
   * only fill <code>numChars - 1</code> characters in order not to split in
   * the middle of a surrogate pair, even if there are remaining characters in
   * the {@link Reader}.
   * <p>
   * This method guarantees
   * that the given {@link CharacterBuffer} will never contain a high surrogate
   * character as the last element in the buffer unless it is the last available
   * character in the reader. In other words, high and low surrogate pairs will
   * always be preserved across buffer boarders.
   * </p>
   * <p>
   * A return value of <code>false</code> means that this method call exhausted
   * the reader, but there may be some bytes which have been read, which can be
   * verified by checking whether <code>buffer.getLength() &gt; 0</code>.
   * </p>
   * 
   * @param buffer
   *          the buffer to fill.
   * @param reader
   *          the reader to read characters from.
   * @param numChars
   *          the number of chars to read
   * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
   * @throws IOException
   *           if the reader throws an {@link IOException}.
   */
  public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;

  /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
  public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
    return fill(buffer, reader, buffer.buffer.length);
  }

  /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
   *  code points from <code>index</code>. */
  public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);

  static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
    int read = 0;
    while (read < len) {
      final int r = reader.read(dest, offset + read, len - read);
      if (r == -1) {
        break;
      }
      read += r;
    }
    return read;
  }

  private static final class Java5CharacterUtils extends CharacterUtils {
    Java5CharacterUtils() {
    }

    @Override
    public int codePointAt(final CharSequence seq, final int offset) {
      return Character.codePointAt(seq, offset);
    }

    @Override
    public int codePointAt(final char[] chars, final int offset, final int limit) {
     return Character.codePointAt(chars, offset, limit);
    }

    @Override
    public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
      assert buffer.buffer.length >= 2;
      if (numChars < 2 || numChars > buffer.buffer.length) {
        throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
      }
      final char[] charBuffer = buffer.buffer;
      buffer.offset = 0;
      final int offset;

      // Install the previously saved ending high surrogate:
      if (buffer.lastTrailingHighSurrogate != 0) {
        charBuffer[0] = buffer.lastTrailingHighSurrogate;
        buffer.lastTrailingHighSurrogate = 0;
        offset = 1;
      } else {
        offset = 0;
      }

      final int read = readFully(reader, charBuffer, offset, numChars - offset);

      buffer.length = offset + read;
      final boolean result = buffer.length == numChars;
      if (buffer.length < numChars) {
        // We failed to fill the buffer. Even if the last char is a high
        // surrogate, there is nothing we can do
        return result;
      }

      if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
        buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
      }
      return result;
    }

    @Override
    public int codePointCount(CharSequence seq) {
      return Character.codePointCount(seq, 0, seq.length());
    }

    @Override
    public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
      return Character.offsetByCodePoints(buf, start, count, index, offset);
    }
  }

  private static final class Java4CharacterUtils extends CharacterUtils {
    Java4CharacterUtils() {
    }

    @Override
    public int codePointAt(final CharSequence seq, final int offset) {
      return seq.charAt(offset);
    }

    @Override
    public int codePointAt(final char[] chars, final int offset, final int limit) {
      if(offset >= limit)
        throw new IndexOutOfBoundsException("offset must be less than limit");
      return chars[offset];
    }

    @Override
    public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
        throws IOException {
      assert buffer.buffer.length >= 1;
      if (numChars < 1 || numChars > buffer.buffer.length) {
        throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
      }
      buffer.offset = 0;
      final int read = readFully(reader, buffer.buffer, 0, numChars);
      buffer.length = read;
      buffer.lastTrailingHighSurrogate = 0;
      return read == numChars;
    }

    @Override
    public int codePointCount(CharSequence seq) {
      return seq.length();
    }

    @Override
    public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
      final int result = index + offset;
      if (result < 0 || result > count) {
        throw new IndexOutOfBoundsException();
      }
      return result;
    }

  }
  
  /**
   * A simple IO buffer to use with
   * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
   */
  public static final class CharacterBuffer {
    
    private final char[] buffer;
    private int offset;
    private int length;
    // NOTE: not private so outer class can access without
    // $access methods:
    char lastTrailingHighSurrogate;
    
    CharacterBuffer(char[] buffer, int offset, int length) {
      this.buffer = buffer;
      this.offset = offset;
      this.length = length;
    }
    
    /**
     * Returns the internal buffer
     * 
     * @return the buffer
     */
    public char[] getBuffer() {
      return buffer;
    }
    
    /**
     * Returns the data offset in the internal buffer.
     * 
     * @return the offset
     */
    public int getOffset() {
      return offset;
    }
    
    /**
     * Return the length of the data in the internal buffer starting at
     * {@link #getOffset()}
     * 
     * @return the length
     */
    public int getLength() {
      return length;
    }
    
    /**
     * Resets the CharacterBuffer. All internals are reset to its default
     * values.
     */
    public void reset() {
      offset = 0;
      length = 0;
      lastTrailingHighSurrogate = 0;
    }
  }

}