diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java new file mode 100644 index 00000000000..0672582d732 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -0,0 +1,222 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import java.util.*; + +/** + * A class which splits consecutive word character sequences into overlapping character n-grams. + * For example "en gul bille sang" split into 2-grams becomes + * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang". + * <p> + * This class is multithread safe. + * + * @author bratseth + */ +public class GramSplitter { + + private final CharacterClasses characterClasses; + + public GramSplitter(CharacterClasses characterClasses) { + this.characterClasses = characterClasses; + } + + /** + * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length] + * pairs into the input string. + * <p> + * The iterator is implemented as a sliding view over the input string rather than being backed by a + * list, which makes this space efficient for large strings. + * + * @param input the input string to be split, cannot be null + * @param n the gram size, a positive integer + * @return a read only iterator over the resulting grams + * @throws NullPointerException if input==null + * @throws IllegalArgumentException if n is less than 1 + */ + public GramSplitterIterator split(String input, int n) { + if (input == null) { + throw new NullPointerException("input cannot be null"); + } + if (n < 1) { + throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n); + } + return new GramSplitterIterator(input, n, characterClasses); + } + + public static class GramSplitterIterator implements Iterator<Gram> { + + private final CharacterClasses characterClasses; + + /** + * Text to split + */ + private final String input; + + /** + * Gram size + */ + private final int n; + + /** + * Current index + */ + private int i = 0; + + /** + * Whether the last thing that happened was being on a separator (including the start of the string) + */ + private boolean isFirstAfterSeparator = true; + + /** + * The next gram or null if not determined yet + */ + private Gram nextGram = null; + + public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) { + this.input = input; + this.n = n; + this.characterClasses = characterClasses; + } + + @Override + public boolean hasNext() { + if (nextGram != null) { + return true; + } + nextGram = findNext(); + return nextGram != null; + } + + @Override + public Gram next() { + Gram currentGram = nextGram; + if (currentGram == null) { + currentGram = findNext(); + } + if (currentGram == null) { + throw new NoSuchElementException("No next gram at position " + i); + } + nextGram = null; + return currentGram; + } + + private Gram findNext() { + // Skip to next word character + while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) { + i++; + isFirstAfterSeparator = true; + } + if (i >= input.length()) { + return null; + } + + String gram = input.substring(i, Math.min(i + n, input.length())); + int nonWordChar = indexOfNonWordChar(gram); + if (nonWordChar == 0) { + throw new RuntimeException("Programming error"); + } + if (nonWordChar > 0) { + gram = gram.substring(0, nonWordChar); + } + + if (gram.length() == n) { // normal case: got a full length gram + i++; + isFirstAfterSeparator = false; + return new Gram(i - 1, gram.length()); + } else { // gram is too short due either to a non-word separator or end of string + if (isFirstAfterSeparator) { // make a gram anyway + i++; + isFirstAfterSeparator = false; + return new Gram(i - 1, gram.length()); + } else { // skip to next + i += gram.length() + 1; + isFirstAfterSeparator = true; + return findNext(); + } + } + } + + private int indexOfNonWordChar(String s) { + for (int i = 0; i < s.length(); i++) { + if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) { + return i; + } + } + return -1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("This iterator is read only"); + } + + /** + * Convenience list which splits the remaining items in this iterator into a list of gram strings + * + * @return an immutable list of extracted grams + */ + public List<String> toExtractedList() { + List<String> gramList = new ArrayList<>(); + while (hasNext()) { + gramList.add(next().extractFrom(input)); + } + return Collections.unmodifiableList(gramList); + } + } + + /** + * An immutable start index and length pair + */ + public static final class Gram { + + private int start, length; + + public Gram(int start, int length) { + this.start = start; + this.length = length; + } + + public int getStart() { + return start; + } + + public int getLength() { + return length; + } + + /** + * Returns this gram as a string from the input string + */ + public String extractFrom(String input) { + return input.substring(start, start + length); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof Gram)) { + return false; + } + + Gram gram = (Gram)o; + + if (length != gram.length) { + return false; + } + if (start != gram.start) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = start; + result = 31 * result + length; + return result; + } + } +} |