summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2020-06-12 12:51:22 +0200
committer: Jon Bratseth <bratseth@gmail.com> 2020-06-12 12:51:22 +0200
commit: 7f7b6777514bf05916e2edcbc3e27b1bfd28906c (patch)
tree: c530cbc56b80eb5128d2d9254b92c0486923f0d4 /linguistics
parent: 9fc05281d6a79c26efe04edeb7604300f0c05845 (diff)
SpareCapacityMaintainer sketch
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java | 1
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java | 3
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java | 83
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/Normalizer.java | 8
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java | 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/Transformer.java | 4
6 files changed, 35 insertions, 66 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index d3f6fcf2ee3..93599fa7dbe 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -15,6 +15,7 @@ import java.util.logging.Logger;
import java.util.logging.Level;
public class OpenNlpTokenizer implements Tokenizer {
+
private final static int SPACE_CODE = 32;
private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName());
private final Normalizer normalizer;
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index ce0291c85e5..59ae664e79e 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -17,7 +17,7 @@ public class CharacterClasses {
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
// if (c == '_') return true;
- // Ticket 3864695, some CJK punctuation YST defined as word characters
+ // Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
c == '\u300c' || c == '\u300d' || c == '\u300e' ||
c == '\u300f' || c == '\u3010' || c == '\u3011') {
@@ -52,4 +52,5 @@ public class CharacterClasses {
public boolean isLetterOrDigit(int c) {
return isLetter(c) || isDigit(c);
}
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 94fd0e08493..aa7ae59edf9 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -39,12 +39,8 @@ public class GramSplitter {
* @throws IllegalArgumentException if n is less than 1
*/
public GramSplitterIterator split(String input, int n) {
- if (input == null) {
- throw new NullPointerException("input cannot be null");
- }
- if (n < 1) {
- throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
- }
+ if (input == null) throw new NullPointerException("input cannot be null");
+ if (n < 1) throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
return new GramSplitterIterator(input, n, characterClasses);
}
@@ -52,29 +48,19 @@ public class GramSplitter {
private final CharacterClasses characterClasses;
- /**
- * Text to split
- */
+ /** Text to split */
private final String input;
- /**
- * Gram size
- */
+ /** Gram size */
private final int n;
- /**
- * Current index
- */
+ /** Current index */
private int i = 0;
- /**
- * Whether the last thing that happened was being on a separator (including the start of the string)
- */
+ /** Whether the last thing that happened was being on a separator (including the start of the string) */
private boolean isFirstAfterSeparator = true;
- /**
- * The next gram or null if not determined yet
- */
+ /** The next gram or null if not determined yet */
private Gram nextGram = null;
public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
@@ -85,9 +71,7 @@ public class GramSplitter {
@Override
public boolean hasNext() {
- if (nextGram != null) {
- return true;
- }
+ if (nextGram != null) return true;
nextGram = findNext();
return nextGram != null;
}
@@ -95,12 +79,10 @@ public class GramSplitter {
@Override
public Gram next() {
Gram currentGram = nextGram;
- if (currentGram == null) {
+ if (currentGram == null)
currentGram = findNext();
- }
- if (currentGram == null) {
+ if (currentGram == null)
throw new NoSuchElementException("No next gram at position " + i);
- }
nextGram = null;
return currentGram;
}
@@ -111,24 +93,21 @@ public class GramSplitter {
i++;
isFirstAfterSeparator = true;
}
- if (i >= input.length()) {
- return null;
- }
+ if (i >= input.length()) return null;
String gram = input.substring(i, Math.min(i + n, input.length()));
int nonWordChar = indexOfNonWordChar(gram);
- if (nonWordChar == 0) {
- throw new RuntimeException("Programming error");
- }
- if (nonWordChar > 0) {
+ if (nonWordChar == 0) throw new RuntimeException("Programming error");
+
+ if (nonWordChar > 0)
gram = gram.substring(0, nonWordChar);
- }
if (gram.length() == n) { // normal case: got a full length gram
i++;
isFirstAfterSeparator = false;
return new Gram(i - 1, gram.length());
- } else { // gram is too short due either to a non-word separator or end of string
+ }
+ else { // gram is too short due either to a non-word separator or end of string
if (isFirstAfterSeparator) { // make a gram anyway
i++;
isFirstAfterSeparator = false;
@@ -143,9 +122,8 @@ public class GramSplitter {
private int indexOfNonWordChar(String s) {
for (int i = 0; i < s.length(); i++) {
- if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) {
+ if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i)))
return i;
- }
}
return -1;
}
@@ -162,9 +140,8 @@ public class GramSplitter {
*/
public List<String> toExtractedList() {
List<String> gramList = new ArrayList<>();
- while (hasNext()) {
+ while (hasNext())
gramList.add(next().extractFrom(input));
- }
return Collections.unmodifiableList(gramList);
}
}
@@ -189,31 +166,19 @@ public class GramSplitter {
return length;
}
- /**
- * Returns this gram as a string from the input string
- */
+ /** Returns this gram as a string from the input string */
public String extractFrom(String input) {
return input.substring(start, start + length);
}
@Override
public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (!(o instanceof Gram)) {
- return false;
- }
+ if (this == o) return true;
+ if ( ! (o instanceof Gram)) return false;
Gram gram = (Gram)o;
-
- if (length != gram.length) {
- return false;
- }
- if (start != gram.start) {
- return false;
- }
-
+ if (length != gram.length) return false;
+ if (start != gram.start) return false;
return true;
}
@@ -223,5 +188,7 @@ public class GramSplitter {
result = 31 * result + length;
return result;
}
+
}
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
index 0e34f88f4ca..044d249f077 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
@@ -9,11 +9,11 @@ package com.yahoo.language.process;
public interface Normalizer {
/**
- * <p>NFKC normalizes a String.</p>
+ * NFKC normalizes a String.
*
- * @param input String to normalize.
- * @return The normalized String.
- * @throws ProcessingException If underlying library throws an Exception.
+ * @param input the string to normalize
+ * @return the normalized string
+ * @throws ProcessingException if underlying library throws an Exception
*/
String normalize(String input);
diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
index 941afa07347..752992f5a26 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
@@ -2,7 +2,7 @@
package com.yahoo.language.process;
/**
- * <p>Exception class indicating that a fatal error occured during linguistic processing.</p>
+ * Exception class indicating that a fatal error occurred during linguistic processing.
*
* @author Simon Thoresen Hult
*/
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
index 46f3c060d4e..4927edc98c9 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
@@ -13,8 +13,8 @@ public interface Transformer {
/**
* Remove accents from input text.
*
- * @param input text to transform.
- * @param language language of input text.
+ * @param input text to transform
+ * @param language language of input text
* @return text with accents removed, or input-text if the feature is unavailable
* @throws ProcessingException thrown if there is an exception stemming this input
*/