diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-12 12:51:22 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-12 12:51:22 +0200 |
commit | 7f7b6777514bf05916e2edcbc3e27b1bfd28906c (patch) | |
tree | c530cbc56b80eb5128d2d9254b92c0486923f0d4 /linguistics/src/main/java | |
parent | 9fc05281d6a79c26efe04edeb7604300f0c05845 (diff) |
SpareCapacityMaintainer sketch
Diffstat (limited to 'linguistics/src/main/java')
6 files changed, 35 insertions, 66 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index d3f6fcf2ee3..93599fa7dbe 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -15,6 +15,7 @@ import java.util.logging.Logger; import java.util.logging.Level; public class OpenNlpTokenizer implements Tokenizer { + private final static int SPACE_CODE = 32; private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName()); private final Normalizer normalizer; diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java index ce0291c85e5..59ae664e79e 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java +++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java @@ -17,7 +17,7 @@ public class CharacterClasses { if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters // if (c == '_') return true; - // Ticket 3864695, some CJK punctuation YST defined as word characters + // Some CJK punctuation defined as word characters if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' || c == '\u300c' || c == '\u300d' || c == '\u300e' || c == '\u300f' || c == '\u3010' || c == '\u3011') { @@ -52,4 +52,5 @@ public class CharacterClasses { public boolean isLetterOrDigit(int c) { return isLetter(c) || isDigit(c); } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java index 94fd0e08493..aa7ae59edf9 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -39,12 +39,8 @@ public class GramSplitter { * @throws IllegalArgumentException if n is less than 1 */ public GramSplitterIterator split(String input, int n) { - if (input == null) { - throw new NullPointerException("input cannot be null"); - } - if (n < 1) { - throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n); - } + if (input == null) throw new NullPointerException("input cannot be null"); + if (n < 1) throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n); return new GramSplitterIterator(input, n, characterClasses); } @@ -52,29 +48,19 @@ public class GramSplitter { private final CharacterClasses characterClasses; - /** - * Text to split - */ + /** Text to split */ private final String input; - /** - * Gram size - */ + /** Gram size */ private final int n; - /** - * Current index - */ + /** Current index */ private int i = 0; - /** - * Whether the last thing that happened was being on a separator (including the start of the string) - */ + /** Whether the last thing that happened was being on a separator (including the start of the string) */ private boolean isFirstAfterSeparator = true; - /** - * The next gram or null if not determined yet - */ + /** The next gram or null if not determined yet */ private Gram nextGram = null; public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) { @@ -85,9 +71,7 @@ public class GramSplitter { @Override public boolean hasNext() { - if (nextGram != null) { - return true; - } + if (nextGram != null) return true; nextGram = findNext(); return nextGram != null; } @@ -95,12 +79,10 @@ public class GramSplitter { @Override public Gram next() { Gram currentGram = nextGram; - if (currentGram == null) { + if (currentGram == null) currentGram = findNext(); - } - if (currentGram == null) { + if (currentGram == null) throw new NoSuchElementException("No next gram at position " + i); - } nextGram = null; return currentGram; } @@ -111,24 +93,21 @@ public class GramSplitter { i++; isFirstAfterSeparator = true; } - if (i >= input.length()) { - return null; - } + if (i >= input.length()) return null; String gram = input.substring(i, Math.min(i + n, input.length())); int nonWordChar = indexOfNonWordChar(gram); - if (nonWordChar == 0) { - throw new RuntimeException("Programming error"); - } - if (nonWordChar > 0) { + if (nonWordChar == 0) throw new RuntimeException("Programming error"); + + if (nonWordChar > 0) gram = gram.substring(0, nonWordChar); - } if (gram.length() == n) { // normal case: got a full length gram i++; isFirstAfterSeparator = false; return new Gram(i - 1, gram.length()); - } else { // gram is too short due either to a non-word separator or end of string + } + else { // gram is too short due either to a non-word separator or end of string if (isFirstAfterSeparator) { // make a gram anyway i++; isFirstAfterSeparator = false; @@ -143,9 +122,8 @@ public class GramSplitter { private int indexOfNonWordChar(String s) { for (int i = 0; i < s.length(); i++) { - if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) { + if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i))) return i; - } } return -1; } @@ -162,9 +140,8 @@ public class GramSplitter { */ public List<String> toExtractedList() { List<String> gramList = new ArrayList<>(); - while (hasNext()) { + while (hasNext()) gramList.add(next().extractFrom(input)); - } return Collections.unmodifiableList(gramList); } } @@ -189,31 +166,19 @@ public class GramSplitter { return length; } - /** - * Returns this gram as a string from the input string - */ + /** Returns this gram as a string from the input string */ public String extractFrom(String input) { return input.substring(start, start + length); } @Override public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Gram)) { - return false; - } + if (this == o) return true; + if ( ! (o instanceof Gram)) return false; Gram gram = (Gram)o; - - if (length != gram.length) { - return false; - } - if (start != gram.start) { - return false; - } - + if (length != gram.length) return false; + if (start != gram.start) return false; return true; } @@ -223,5 +188,7 @@ public class GramSplitter { result = 31 * result + length; return result; } + } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java index 0e34f88f4ca..044d249f077 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java @@ -9,11 +9,11 @@ package com.yahoo.language.process; public interface Normalizer { /** - * <p>NFKC normalizes a String.</p> + * NFKC normalizes a String. * - * @param input String to normalize. - * @return The normalized String. - * @throws ProcessingException If underlying library throws an Exception. + * @param input the string to normalize + * @return the normalized string + * @throws ProcessingException if underlying library throws an Exception */ String normalize(String input); diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java index 941afa07347..752992f5a26 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java +++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java @@ -2,7 +2,7 @@ package com.yahoo.language.process; /** - * <p>Exception class indicating that a fatal error occured during linguistic processing.</p> + * Exception class indicating that a fatal error occured during linguistic processing. * * @author Simon Thoresen Hult */ diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java index 46f3c060d4e..4927edc98c9 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java @@ -13,8 +13,8 @@ public interface Transformer { /** * Remove accents from input text. * - * @param input text to transform. - * @param language language of input text. + * @param input text to transform + * @param language language of input text * @return text with accents removed, or input-text if the feature is unavailable * @throws ProcessingException thrown if there is an exception stemming this input */ |