diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java new file mode 100644 index 00000000000..91bd6286b28 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java @@ -0,0 +1,375 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed + * under the terms of the Apache License, Version 2.0. + */ +package com.yahoo.language.simple.kstem; + + +import java.io.IOException; +import java.io.Reader; + +/** + * {@link CharacterUtils} provides a unified interface to Character-related + * operations to implement backwards compatible character operations. + */ +public abstract class CharacterUtils { + + private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils(); + private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); + + /** + * Returns a {@link CharacterUtils} implementation. + */ + public static CharacterUtils getInstance() { + return JAVA_5; + } + + /** + * explicitly returns a version matching java 4 semantics + * @deprecated Only for n-gram backwards compat + */ + @Deprecated + public static CharacterUtils getJava4Instance() { + return JAVA_4; + } + + /** + * Returns the code point at the given index of the {@link CharSequence}. + * + * @param seq + * a character sequence + * @param offset + * the offset to the char values in the chars array to be converted + * + * @return the Unicode code point at the given index + * @throws NullPointerException + * - if the sequence is null. + * @throws IndexOutOfBoundsException + * - if the value offset is negative or not less than the length of + * the character sequence. + */ + public abstract int codePointAt(final CharSequence seq, final int offset); + + /** + * Returns the code point at the given index of the char array where only elements + * with index less than the limit are used. + * + * @param chars + * a character array + * @param offset + * the offset to the char values in the chars array to be converted + * @param limit the index afer the last element that should be used to calculate + * codepoint. + * + * @return the Unicode code point at the given index + * @throws NullPointerException + * - if the array is null. + * @throws IndexOutOfBoundsException + * - if the value offset is negative or not less than the length of + * the char array. + */ + public abstract int codePointAt(final char[] chars, final int offset, final int limit); + + /** Return the number of characters in <code>seq</code>. */ + public abstract int codePointCount(CharSequence seq); + + /** + * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code> + * of the given bufferSize. + * + * @param bufferSize + * the internal char buffer size, must be <code>>= 2</code> + * @return a new {@link CharacterBuffer} instance. + */ + public static CharacterBuffer newCharacterBuffer(final int bufferSize) { + if (bufferSize < 2) { + throw new IllegalArgumentException("buffersize must be >= 2"); + } + return new CharacterBuffer(new char[bufferSize], 0, 0); + } + + + /** + * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting + * at the given offset. + * @param buffer the char buffer to lowercase + * @param offset the offset to start at + * @param limit the max char in the buffer to lower case + */ + public final void toLowerCase(final char[] buffer, final int offset, final int limit) { + assert buffer.length >= limit; + assert offset <=0 && offset <= buffer.length; + for (int i = offset; i < limit;) { + i += Character.toChars( + Character.toLowerCase( + codePointAt(buffer, i, limit)), buffer, i); + } + } + + /** + * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting + * at the given offset. + * @param buffer the char buffer to UPPERCASE + * @param offset the offset to start at + * @param limit the max char in the buffer to lower case + */ + public final void toUpperCase(final char[] buffer, final int offset, final int limit) { + assert buffer.length >= limit; + assert offset <=0 && offset <= buffer.length; + for (int i = offset; i < limit;) { + i += Character.toChars( + Character.toUpperCase( + codePointAt(buffer, i, limit)), buffer, i); + } + } + + /** Converts a sequence of Java characters to a sequence of unicode code points. + * @return the number of code points written to the destination buffer */ + public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) { + if (srcLen < 0) { + throw new IllegalArgumentException("srcLen must be >= 0"); + } + int codePointCount = 0; + for (int i = 0; i < srcLen; ) { + final int cp = codePointAt(src, srcOff + i, srcOff + srcLen); + final int charCount = Character.charCount(cp); + dest[destOff + codePointCount++] = cp; + i += charCount; + } + return codePointCount; + } + + /** Converts a sequence of unicode code points to a sequence of Java characters. + * @return the number of chars written to the destination buffer */ + public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) { + if (srcLen < 0) { + throw new IllegalArgumentException("srcLen must be >= 0"); + } + int written = 0; + for (int i = 0; i < srcLen; ++i) { + written += Character.toChars(src[srcOff + i], dest, destOff + written); + } + return written; + } + + /** + * Fills the {@link CharacterBuffer} with characters read from the given + * reader {@link Reader}. This method tries to read <code>numChars</code> + * characters into the {@link CharacterBuffer}, each call to fill will start + * filling the buffer from offset <code>0</code> up to <code>numChars</code>. + * In case code points can span across 2 java characters, this method may + * only fill <code>numChars - 1</code> characters in order not to split in + * the middle of a surrogate pair, even if there are remaining characters in + * the {@link Reader}. + * <p> + * This method guarantees + * that the given {@link CharacterBuffer} will never contain a high surrogate + * character as the last element in the buffer unless it is the last available + * character in the reader. In other words, high and low surrogate pairs will + * always be preserved across buffer boarders. + * </p> + * <p> + * A return value of <code>false</code> means that this method call exhausted + * the reader, but there may be some bytes which have been read, which can be + * verified by checking whether <code>buffer.getLength() > 0</code>. + * </p> + * + * @param buffer + * the buffer to fill. + * @param reader + * the reader to read characters from. + * @param numChars + * the number of chars to read + * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer + * @throws IOException + * if the reader throws an {@link IOException}. + */ + public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException; + + /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */ + public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException { + return fill(buffer, reader, buffer.buffer.length); + } + + /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code> + * code points from <code>index</code>. */ + public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset); + + static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException { + int read = 0; + while (read < len) { + final int r = reader.read(dest, offset + read, len - read); + if (r == -1) { + break; + } + read += r; + } + return read; + } + + private static final class Java5CharacterUtils extends CharacterUtils { + Java5CharacterUtils() { + } + + @Override + public int codePointAt(final CharSequence seq, final int offset) { + return Character.codePointAt(seq, offset); + } + + @Override + public int codePointAt(final char[] chars, final int offset, final int limit) { + return Character.codePointAt(chars, offset, limit); + } + + @Override + public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException { + assert buffer.buffer.length >= 2; + if (numChars < 2 || numChars > buffer.buffer.length) { + throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size"); + } + final char[] charBuffer = buffer.buffer; + buffer.offset = 0; + final int offset; + + // Install the previously saved ending high surrogate: + if (buffer.lastTrailingHighSurrogate != 0) { + charBuffer[0] = buffer.lastTrailingHighSurrogate; + buffer.lastTrailingHighSurrogate = 0; + offset = 1; + } else { + offset = 0; + } + + final int read = readFully(reader, charBuffer, offset, numChars - offset); + + buffer.length = offset + read; + final boolean result = buffer.length == numChars; + if (buffer.length < numChars) { + // We failed to fill the buffer. Even if the last char is a high + // surrogate, there is nothing we can do + return result; + } + + if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) { + buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; + } + return result; + } + + @Override + public int codePointCount(CharSequence seq) { + return Character.codePointCount(seq, 0, seq.length()); + } + + @Override + public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) { + return Character.offsetByCodePoints(buf, start, count, index, offset); + } + } + + private static final class Java4CharacterUtils extends CharacterUtils { + Java4CharacterUtils() { + } + + @Override + public int codePointAt(final CharSequence seq, final int offset) { + return seq.charAt(offset); + } + + @Override + public int codePointAt(final char[] chars, final int offset, final int limit) { + if(offset >= limit) + throw new IndexOutOfBoundsException("offset must be less than limit"); + return chars[offset]; + } + + @Override + public boolean fill(CharacterBuffer buffer, Reader reader, int numChars) + throws IOException { + assert buffer.buffer.length >= 1; + if (numChars < 1 || numChars > buffer.buffer.length) { + throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size"); + } + buffer.offset = 0; + final int read = readFully(reader, buffer.buffer, 0, numChars); + buffer.length = read; + buffer.lastTrailingHighSurrogate = 0; + return read == numChars; + } + + @Override + public int codePointCount(CharSequence seq) { + return seq.length(); + } + + @Override + public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) { + final int result = index + offset; + if (result < 0 || result > count) { + throw new IndexOutOfBoundsException(); + } + return result; + } + + } + + /** + * A simple IO buffer to use with + * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. + */ + public static final class CharacterBuffer { + + private final char[] buffer; + private int offset; + private int length; + // NOTE: not private so outer class can access without + // $access methods: + char lastTrailingHighSurrogate; + + CharacterBuffer(char[] buffer, int offset, int length) { + this.buffer = buffer; + this.offset = offset; + this.length = length; + } + + /** + * Returns the internal buffer + * + * @return the buffer + */ + public char[] getBuffer() { + return buffer; + } + + /** + * Returns the data offset in the internal buffer. + * + * @return the offset + */ + public int getOffset() { + return offset; + } + + /** + * Return the length of the data in the internal buffer starting at + * {@link #getOffset()} + * + * @return the length + */ + public int getLength() { + return length; + } + + /** + * Resets the CharacterBuffer. All internals are reset to its default + * values. + */ + public void reset() { + offset = 0; + length = 0; + lastTrailingHighSurrogate = 0; + } + } + +} |