summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java375
1 files changed, 375 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
new file mode 100644
index 00000000000..91bd6286b28
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
@@ -0,0 +1,375 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0.
+ */
+package com.yahoo.language.simple.kstem;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * {@link CharacterUtils} provides a unified interface to Character-related
+ * operations to implement backwards compatible character operations.
+ */
+public abstract class CharacterUtils {
+
+ private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
+ private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
+
+ /**
+ * Returns a {@link CharacterUtils} implementation.
+ */
+ public static CharacterUtils getInstance() {
+ return JAVA_5;
+ }
+
+ /**
+ * explicitly returns a version matching java 4 semantics
+ * @deprecated Only for n-gram backwards compat
+ */
+ @Deprecated
+ public static CharacterUtils getJava4Instance() {
+ return JAVA_4;
+ }
+
+ /**
+ * Returns the code point at the given index of the {@link CharSequence}.
+ *
+ * @param seq
+ * a character sequence
+ * @param offset
+ * the offset to the char values in the chars array to be converted
+ *
+ * @return the Unicode code point at the given index
+ * @throws NullPointerException
+ * - if the sequence is null.
+ * @throws IndexOutOfBoundsException
+ * - if the value offset is negative or not less than the length of
+ * the character sequence.
+ */
+ public abstract int codePointAt(final CharSequence seq, final int offset);
+
+ /**
+ * Returns the code point at the given index of the char array where only elements
+ * with index less than the limit are used.
+ *
+ * @param chars
+ * a character array
+ * @param offset
+ * the offset to the char values in the chars array to be converted
+ * @param limit the index afer the last element that should be used to calculate
+ * codepoint.
+ *
+ * @return the Unicode code point at the given index
+ * @throws NullPointerException
+ * - if the array is null.
+ * @throws IndexOutOfBoundsException
+ * - if the value offset is negative or not less than the length of
+ * the char array.
+ */
+ public abstract int codePointAt(final char[] chars, final int offset, final int limit);
+
+ /** Return the number of characters in <code>seq</code>. */
+ public abstract int codePointCount(CharSequence seq);
+
+ /**
+ * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
+ * of the given bufferSize.
+ *
+ * @param bufferSize
+ * the internal char buffer size, must be <code>&gt;= 2</code>
+ * @return a new {@link CharacterBuffer} instance.
+ */
+ public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
+ if (bufferSize < 2) {
+ throw new IllegalArgumentException("buffersize must be >= 2");
+ }
+ return new CharacterBuffer(new char[bufferSize], 0, 0);
+ }
+
+
+ /**
+ * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
+ * at the given offset.
+ * @param buffer the char buffer to lowercase
+ * @param offset the offset to start at
+ * @param limit the max char in the buffer to lower case
+ */
+ public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
+ assert buffer.length >= limit;
+ assert offset <=0 && offset <= buffer.length;
+ for (int i = offset; i < limit;) {
+ i += Character.toChars(
+ Character.toLowerCase(
+ codePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
+ /**
+ * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting
+ * at the given offset.
+ * @param buffer the char buffer to UPPERCASE
+ * @param offset the offset to start at
+ * @param limit the max char in the buffer to lower case
+ */
+ public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+ assert buffer.length >= limit;
+ assert offset <=0 && offset <= buffer.length;
+ for (int i = offset; i < limit;) {
+ i += Character.toChars(
+ Character.toUpperCase(
+ codePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
+ /** Converts a sequence of Java characters to a sequence of unicode code points.
+ * @return the number of code points written to the destination buffer */
+ public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+ if (srcLen < 0) {
+ throw new IllegalArgumentException("srcLen must be >= 0");
+ }
+ int codePointCount = 0;
+ for (int i = 0; i < srcLen; ) {
+ final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
+ final int charCount = Character.charCount(cp);
+ dest[destOff + codePointCount++] = cp;
+ i += charCount;
+ }
+ return codePointCount;
+ }
+
+ /** Converts a sequence of unicode code points to a sequence of Java characters.
+ * @return the number of chars written to the destination buffer */
+ public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+ if (srcLen < 0) {
+ throw new IllegalArgumentException("srcLen must be >= 0");
+ }
+ int written = 0;
+ for (int i = 0; i < srcLen; ++i) {
+ written += Character.toChars(src[srcOff + i], dest, destOff + written);
+ }
+ return written;
+ }
+
+ /**
+ * Fills the {@link CharacterBuffer} with characters read from the given
+ * reader {@link Reader}. This method tries to read <code>numChars</code>
+ * characters into the {@link CharacterBuffer}, each call to fill will start
+ * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
+ * In case code points can span across 2 java characters, this method may
+ * only fill <code>numChars - 1</code> characters in order not to split in
+ * the middle of a surrogate pair, even if there are remaining characters in
+ * the {@link Reader}.
+ * <p>
+ * This method guarantees
+ * that the given {@link CharacterBuffer} will never contain a high surrogate
+ * character as the last element in the buffer unless it is the last available
+ * character in the reader. In other words, high and low surrogate pairs will
+ * always be preserved across buffer boarders.
+ * </p>
+ * <p>
+ * A return value of <code>false</code> means that this method call exhausted
+ * the reader, but there may be some bytes which have been read, which can be
+ * verified by checking whether <code>buffer.getLength() &gt; 0</code>.
+ * </p>
+ *
+ * @param buffer
+ * the buffer to fill.
+ * @param reader
+ * the reader to read characters from.
+ * @param numChars
+ * the number of chars to read
+ * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
+ * @throws IOException
+ * if the reader throws an {@link IOException}.
+ */
+ public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
+
+ /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
+ public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+ return fill(buffer, reader, buffer.buffer.length);
+ }
+
+ /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
+ * code points from <code>index</code>. */
+ public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
+
+ static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
+ int read = 0;
+ while (read < len) {
+ final int r = reader.read(dest, offset + read, len - read);
+ if (r == -1) {
+ break;
+ }
+ read += r;
+ }
+ return read;
+ }
+
+ private static final class Java5CharacterUtils extends CharacterUtils {
+ Java5CharacterUtils() {
+ }
+
+ @Override
+ public int codePointAt(final CharSequence seq, final int offset) {
+ return Character.codePointAt(seq, offset);
+ }
+
+ @Override
+ public int codePointAt(final char[] chars, final int offset, final int limit) {
+ return Character.codePointAt(chars, offset, limit);
+ }
+
+ @Override
+ public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
+ assert buffer.buffer.length >= 2;
+ if (numChars < 2 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+ }
+ final char[] charBuffer = buffer.buffer;
+ buffer.offset = 0;
+ final int offset;
+
+ // Install the previously saved ending high surrogate:
+ if (buffer.lastTrailingHighSurrogate != 0) {
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ buffer.lastTrailingHighSurrogate = 0;
+ offset = 1;
+ } else {
+ offset = 0;
+ }
+
+ final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+ buffer.length = offset + read;
+ final boolean result = buffer.length == numChars;
+ if (buffer.length < numChars) {
+ // We failed to fill the buffer. Even if the last char is a high
+ // surrogate, there is nothing we can do
+ return result;
+ }
+
+ if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+ buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ }
+ return result;
+ }
+
+ @Override
+ public int codePointCount(CharSequence seq) {
+ return Character.codePointCount(seq, 0, seq.length());
+ }
+
+ @Override
+ public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+ return Character.offsetByCodePoints(buf, start, count, index, offset);
+ }
+ }
+
+ private static final class Java4CharacterUtils extends CharacterUtils {
+ Java4CharacterUtils() {
+ }
+
+ @Override
+ public int codePointAt(final CharSequence seq, final int offset) {
+ return seq.charAt(offset);
+ }
+
+ @Override
+ public int codePointAt(final char[] chars, final int offset, final int limit) {
+ if(offset >= limit)
+ throw new IndexOutOfBoundsException("offset must be less than limit");
+ return chars[offset];
+ }
+
+ @Override
+ public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
+ throws IOException {
+ assert buffer.buffer.length >= 1;
+ if (numChars < 1 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
+ }
+ buffer.offset = 0;
+ final int read = readFully(reader, buffer.buffer, 0, numChars);
+ buffer.length = read;
+ buffer.lastTrailingHighSurrogate = 0;
+ return read == numChars;
+ }
+
+ @Override
+ public int codePointCount(CharSequence seq) {
+ return seq.length();
+ }
+
+ @Override
+ public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+ final int result = index + offset;
+ if (result < 0 || result > count) {
+ throw new IndexOutOfBoundsException();
+ }
+ return result;
+ }
+
+ }
+
+ /**
+ * A simple IO buffer to use with
+ * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
+ */
+ public static final class CharacterBuffer {
+
+ private final char[] buffer;
+ private int offset;
+ private int length;
+ // NOTE: not private so outer class can access without
+ // $access methods:
+ char lastTrailingHighSurrogate;
+
+ CharacterBuffer(char[] buffer, int offset, int length) {
+ this.buffer = buffer;
+ this.offset = offset;
+ this.length = length;
+ }
+
+ /**
+ * Returns the internal buffer
+ *
+ * @return the buffer
+ */
+ public char[] getBuffer() {
+ return buffer;
+ }
+
+ /**
+ * Returns the data offset in the internal buffer.
+ *
+ * @return the offset
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * Return the length of the data in the internal buffer starting at
+ * {@link #getOffset()}
+ *
+ * @return the length
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Resets the CharacterBuffer. All internals are reset to its default
+ * values.
+ */
+ public void reset() {
+ offset = 0;
+ length = 0;
+ lastTrailingHighSurrogate = 0;
+ }
+ }
+
+}