From 7f9ed6ede10ba75023d640e38108352ee0c36a37 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Tue, 21 Sep 2021 17:45:50 +0200 Subject: Linguistics cleanup --- linguistics/abi-spec.json | 11 ----------- .../src/main/java/com/yahoo/language/Linguistics.java | 1 + .../main/java/com/yahoo/language/detect/Detection.java | 1 + .../com/yahoo/language/detect/DetectionException.java | 3 ++- .../src/main/java/com/yahoo/language/detect/Hint.java | 5 +++-- .../com/yahoo/language/opennlp/OpenNlpLinguistics.java | 1 + .../com/yahoo/language/opennlp/OpenNlpTokenizer.java | 3 +-- .../com/yahoo/language/opennlp/OptimaizeDetector.java | 10 +++++----- .../com/yahoo/language/process/ProcessingException.java | 1 + .../main/java/com/yahoo/language/process/StemList.java | 4 +++- .../main/java/com/yahoo/language/process/StemMode.java | 16 +++++----------- .../main/java/com/yahoo/language/process/Stemmer.java | 2 +- .../java/com/yahoo/language/process/StemmerImpl.java | 1 + .../java/com/yahoo/language/process/TokenScript.java | 2 +- .../main/java/com/yahoo/language/sentencepiece/Trie.java | 2 +- .../java/com/yahoo/language/simple/SimpleDetector.java | 9 --------- .../com/yahoo/language/simple/SimpleLinguistics.java | 1 + .../java/com/yahoo/language/simple/SimpleTokenType.java | 1 + 18 files changed, 29 insertions(+), 45 deletions(-) diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index dc7450678c5..dbf4842ea1a 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -918,16 +918,5 @@ "public java.lang.String normalize(java.lang.String)" ], "fields": [] - }, - "com.yahoo.language.sentencepiece.Trie": { - "superClass": "java.lang.Object", - "interfaces": [], - "attributes": [ - "public" - ], - "methods": [ - "public void ()" - ], - "fields": [] } } \ No newline at end of file diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java index 64ef8762be8..8af0fcd42cb 100644 --- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java @@ -88,4 +88,5 @@ public interface Linguistics { /** Check if another instance is equivalent to this one */ boolean equals(Linguistics other); + } diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Detection.java b/linguistics/src/main/java/com/yahoo/language/detect/Detection.java index 4b816335154..127777db4d2 100644 --- a/linguistics/src/main/java/com/yahoo/language/detect/Detection.java +++ b/linguistics/src/main/java/com/yahoo/language/detect/Detection.java @@ -44,4 +44,5 @@ public class Detection { public boolean isLocal() { return local; } + } diff --git a/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java b/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java index a43dc0cb537..5fceabefae3 100644 --- a/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java +++ b/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java @@ -4,11 +4,12 @@ package com.yahoo.language.detect; /** * Exception that is thrown when detection fails. * - * @author Einar M R Rosenvinge + * @author Einar M R Rosenvinge */ public final class DetectionException extends RuntimeException { public DetectionException(String str) { super(str); } + } diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Hint.java b/linguistics/src/main/java/com/yahoo/language/detect/Hint.java index 50291c922e8..b6bf4403cf3 100644 --- a/linguistics/src/main/java/com/yahoo/language/detect/Hint.java +++ b/linguistics/src/main/java/com/yahoo/language/detect/Hint.java @@ -2,9 +2,9 @@ package com.yahoo.language.detect; /** - *

A hint that can be given to a {@link Detector}.

+ * A hint that can be given to a {@link Detector}. * - * @author Einar M R Rosenvinge + * @author Einar M R Rosenvinge */ public class Hint { @@ -35,4 +35,5 @@ public class Hint { public static Hint newInstance(String market, String country) { return new Hint(market, country); } + } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 64888dba183..0edd48f5ee3 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -49,4 +49,5 @@ public class OpenNlpLinguistics extends SimpleLinguistics { @Override public boolean equals(Linguistics other) { return (other instanceof OpenNlpLinguistics); } + } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 73518876c3f..603905bead8 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -19,7 +19,6 @@ import opennlp.tools.stemmer.Stemmer; import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; -import java.util.Collections; import java.util.List; /** @@ -52,7 +51,7 @@ public class OpenNlpTokenizer implements Tokenizer { @Override public Iterable tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { - if (input.isEmpty()) return Collections.emptyList(); + if (input.isEmpty()) return List.of(); Stemmer stemmer = stemmerFor(language, stemMode); if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java index bf07c91ba44..9bf1281e015 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java @@ -32,10 +32,10 @@ import java.util.logging.Level; */ public class OptimaizeDetector implements Detector { - static private Object initGuard = new Object(); - static private TextObjectFactory textObjectFactory = null; - static private LanguageDetector languageDetector = null; - static private final Logger log = Logger.getLogger(OptimaizeDetector.class.getName()); + private static final Object initGuard = new Object(); + private static TextObjectFactory textObjectFactory = null; + private static LanguageDetector languageDetector = null; + private static final Logger log = Logger.getLogger(OptimaizeDetector.class.getName()); static private void initOptimaize() { synchronized (initGuard) { @@ -60,7 +60,7 @@ public class OptimaizeDetector implements Detector { } } - private SimpleDetector simpleDetector = new SimpleDetector(); + private final SimpleDetector simpleDetector = new SimpleDetector(); public OptimaizeDetector() { initOptimaize(); diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java index 752992f5a26..99576240635 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java +++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java @@ -15,4 +15,5 @@ public class ProcessingException extends RuntimeException { public ProcessingException(String message, Throwable cause) { super(message, cause); } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemList.java b/linguistics/src/main/java/com/yahoo/language/process/StemList.java index a38a2e51cb6..d5451e7660d 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/StemList.java +++ b/linguistics/src/main/java/com/yahoo/language/process/StemList.java @@ -3,6 +3,7 @@ package com.yahoo.language.process; import java.util.AbstractList; import java.util.ArrayList; +import java.util.List; /** * A list of strings which does not allow for duplicate elements. @@ -10,7 +11,8 @@ import java.util.ArrayList; * @author steinar */ public class StemList extends AbstractList { - private final ArrayList stems; + + private final List stems; public StemList() { this(new String[0]); diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java index 628f6910c9e..4adb5de62da 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java +++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java @@ -10,16 +10,10 @@ package com.yahoo.language.process; */ public enum StemMode { - NONE(0), - DEFAULT(1), - ALL(2), - SHORTEST(4), - BEST(5); - - private final int value; - - StemMode(int value) { - this.value = value; - } + NONE, + DEFAULT, + ALL, + SHORTEST, + BEST; } diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java index a2d0d0a84c9..1c6180c1f59 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java @@ -18,7 +18,7 @@ public interface Stemmer { * @param input the string to stem. * @param mode the stemming mode * @param language the language to use for stemming - * @return list of possible stems. Empty if none. + * @return a list of possible stems. Empty if none. * @throws ProcessingException thrown if there is an exception stemming this input */ List stem(String input, StemMode mode, Language language); diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java index f401ddaba99..dd830570e88 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java +++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java @@ -43,4 +43,5 @@ public class StemmerImpl implements Stemmer { } } } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java index efe4073d97e..ff87b9b128b 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java @@ -5,7 +5,7 @@ package com.yahoo.language.process; * List of token scripts (e.g. latin, japanese, chinese, etc.) which may warrant different * linguistics treatment. * - * @author Mathias Mølster Lidal + * @author Mathias Mølster Lidal */ public enum TokenScript { diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Trie.java b/linguistics/src/main/java/com/yahoo/language/sentencepiece/Trie.java index 9abed89e7a2..8e7c2db2ed3 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Trie.java +++ b/linguistics/src/main/java/com/yahoo/language/sentencepiece/Trie.java @@ -9,7 +9,7 @@ import java.util.Map; * * @author bratseth */ -public class Trie { +class Trie { final Node root = new Node(); diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index 3de0eb3e997..e15c6257414 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -70,15 +70,6 @@ public class SimpleDetector implements Detector { block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) { return Language.KOREAN; } - // katakana phonetic extensions. - if (0x31f0 <= c && c <= 0x31ff) { - // See http://www.unicode.org/charts/PDF/U31F0.pdf - // This is a special case because This range of character - // codes is classified as unasigned in - // Character.UnicodeBlock. But clearly it is assigned as - // per above. - return Language.JAPANESE; - } if (0x31f0 <= c && c <= 0x31ff || // these are standard character blocks for japanese characters. block == Character.UnicodeBlock.HIRAGANA || block == Character.UnicodeBlock.KATAKANA || diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index 026bc8add25..b319c343510 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -72,4 +72,5 @@ public class SimpleLinguistics implements Linguistics { @Override public boolean equals(Linguistics other) { return (other instanceof SimpleLinguistics); } + } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java index d7eb8a72ed8..b5c11b13c67 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java @@ -65,4 +65,5 @@ public class SimpleTokenType { } throw new UnsupportedOperationException(String.valueOf(Character.getType(codePoint))); } + } -- cgit v1.2.3