// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.process; import java.util.*; /** * A class which splits consecutive word character sequences into overlapping character n-grams. * For example "en gul bille sang" split into 2-grams becomes * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang". *

* This class is multithread safe. * * @author bratseth */ public class GramSplitter { private final CharacterClasses characterClasses; public GramSplitter(CharacterClasses characterClasses) { this.characterClasses = characterClasses; } /** * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length] * pairs into the input string. *

* The iterator is implemented as a sliding view over the input string rather than being backed by a * list, which makes this space efficient for large strings. * * @param input the input string to be split, cannot be null * @param n the gram size, a positive integer * @return a read only iterator over the resulting grams * @throws NullPointerException if input==null * @throws IllegalArgumentException if n is less than 1 */ public GramSplitterIterator split(String input, int n) { if (input == null) { throw new NullPointerException("input cannot be null"); } if (n < 1) { throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n); } return new GramSplitterIterator(input, n, characterClasses); } public static class GramSplitterIterator implements Iterator { private final CharacterClasses characterClasses; /** * Text to split */ private final String input; /** * Gram size */ private final int n; /** * Current index */ private int i = 0; /** * Whether the last thing that happened was being on a separator (including the start of the string) */ private boolean isFirstAfterSeparator = true; /** * The next gram or null if not determined yet */ private Gram nextGram = null; public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) { this.input = input; this.n = n; this.characterClasses = characterClasses; } @Override public boolean hasNext() { if (nextGram != null) { return true; } nextGram = findNext(); return nextGram != null; } @Override public Gram next() { Gram currentGram = nextGram; if (currentGram == null) { currentGram = findNext(); } if (currentGram == null) { throw new NoSuchElementException("No next gram at position " + i); } nextGram = null; return currentGram; } private Gram findNext() { // Skip to next word character while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) { i++; isFirstAfterSeparator = true; } if (i >= input.length()) { return null; } String gram = input.substring(i, Math.min(i + n, input.length())); int nonWordChar = indexOfNonWordChar(gram); if (nonWordChar == 0) { throw new RuntimeException("Programming error"); } if (nonWordChar > 0) { gram = gram.substring(0, nonWordChar); } if (gram.length() == n) { // normal case: got a full length gram i++; isFirstAfterSeparator = false; return new Gram(i - 1, gram.length()); } else { // gram is too short due either to a non-word separator or end of string if (isFirstAfterSeparator) { // make a gram anyway i++; isFirstAfterSeparator = false; return new Gram(i - 1, gram.length()); } else { // skip to next i += gram.length() + 1; isFirstAfterSeparator = true; return findNext(); } } } private int indexOfNonWordChar(String s) { for (int i = 0; i < s.length(); i++) { if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) { return i; } } return -1; } @Override public void remove() { throw new UnsupportedOperationException("This iterator is read only"); } /** * Convenience list which splits the remaining items in this iterator into a list of gram strings * * @return an immutable list of extracted grams */ public List toExtractedList() { List gramList = new ArrayList<>(); while (hasNext()) { gramList.add(next().extractFrom(input)); } return Collections.unmodifiableList(gramList); } } /** * An immutable start index and length pair */ public static final class Gram { private int start, length; public Gram(int start, int length) { this.start = start; this.length = length; } public int getStart() { return start; } public int getLength() { return length; } /** * Returns this gram as a string from the input string */ public String extractFrom(String input) { return input.substring(start, start + length); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof Gram)) { return false; } Gram gram = (Gram)o; if (length != gram.length) { return false; } if (start != gram.start) { return false; } return true; } @Override public int hashCode() { int result = start; result = 31 * result + length; return result; } } }