diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-14 10:08:30 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-04-14 10:08:30 +0200 |
commit | 9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (patch) | |
tree | c270c9ba65a121a87deb877510ba527729f20876 | |
parent | fd9b726786f4c00b276f2d84fd0a3593a0c406eb (diff) |
No functional changes
13 files changed, 84 insertions, 128 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java index 7a3f5fa4055..174d16fbd67 100644 --- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java +++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java @@ -14,13 +14,14 @@ import java.util.Locale; public class LinguisticsCase { /** - * <p>The lower casing method to use in Vespa when doing language independent processing of natural language data. - * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.</p> - * <p>Return a lowercased version of the given string. Since this is language independent, this is more of a case - * normalization operation than lowercasing.</p> + * The lower casing method to use in Vespa when doing language independent processing of natural language data. + * It is placed in a single place to ensure symmetry between e.g. query processing and indexing. * - * @param in The string to lowercase. - * @return A string containing only lowercase character. + * Return a lowercased version of the given string. Since this is language independent, this is more of a case + * normalization operation than lowercasing. + * + * @param in the string to lowercase + * @return a string containing only lowercase characters */ public static String toLowerCase(String in) { // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29 diff --git a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java index 2760f9e673e..05b57937625 100644 --- a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java +++ b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java @@ -2,6 +2,7 @@ package com.yahoo.language; import java.util.Locale; +import java.util.Objects; /** * @author Simon Thoresen Hult @@ -10,25 +11,20 @@ public final class LocaleFactory { private static final Locale UNKNOWN = new Locale("", "", ""); - private LocaleFactory() { - // hide - } + private LocaleFactory() {} /** * Implements a simple parser for RFC5646 language tags. The language tag is parsed into a Locale. * - * @param tag The language tag to parse. - * @return The corresponding Locale. + * @param tag the language tag to parse + * @return the corresponding Locale */ - @SuppressWarnings("ConstantConditions") public static Locale fromLanguageTag(String tag) { - // TODO: Should be replaced by return Locale.forLanguageTag(tag); ? + Objects.requireNonNull(tag, "tag cannot be null"); - tag.getClass(); // throws NullPointerException tag = tag.trim(); - if (tag.isEmpty()) { - return UNKNOWN; - } + if (tag.isEmpty()) return UNKNOWN; + String language = ""; String region = ""; String script = ""; @@ -48,9 +44,7 @@ public final class LocaleFactory { } } } - if (language.isEmpty()) { - return UNKNOWN; - } + if (language.isEmpty()) return UNKNOWN; return new Locale(language, region, script); } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 0837b25c151..a5f77fca0af 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -12,6 +12,8 @@ import java.util.logging.Level; /** * Returns a linguistics implementation based on OpenNlp, * and (optionally, default on) Optimaize for language detection. + * + * @author bratseth */ public class OpenNlpLinguistics extends SimpleLinguistics { diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 93599fa7dbe..e1185cb2457 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -3,21 +3,32 @@ package com.yahoo.language.opennlp; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; -import com.yahoo.language.process.*; -import com.yahoo.language.simple.*; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleToken; +import com.yahoo.language.simple.SimpleTokenType; +import com.yahoo.language.simple.SimpleTokenizer; +import com.yahoo.language.simple.SimpleTransformer; import opennlp.tools.stemmer.Stemmer; import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.logging.Logger; -import java.util.logging.Level; +/** + * Tokenizer using OpenNlp + * + * @author matskin + */ public class OpenNlpTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; - private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName()); private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -35,10 +46,8 @@ public class OpenNlpTokenizer implements Tokenizer { @Override public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { if (input.isEmpty()) return Collections.emptyList(); - Stemmer stemmer = getStemmerForLanguage(language, stemMode); - if (stemmer == null) { - return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); - } + Stemmer stemmer = stemmerFor(language, stemMode); + if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); List<Token> tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); @@ -49,9 +58,7 @@ public class OpenNlpTokenizer implements Tokenizer { if (!prevType.isIndexable() || !nextType.isIndexable()) { String original = input.substring(prev, next); String token = processToken(original, language, stemMode, removeAccents, stemmer); - tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token)); prev = next; prevType = nextType; } @@ -60,89 +67,45 @@ public class OpenNlpTokenizer implements Tokenizer { return tokens; } - private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { - log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode); - if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) { - return null; - } - SnowballStemmer.ALGORITHM alg; - switch (language) { - case DANISH: - alg = SnowballStemmer.ALGORITHM.DANISH; - break; - case DUTCH: - alg = SnowballStemmer.ALGORITHM.DUTCH; - break; - case FINNISH: - alg = SnowballStemmer.ALGORITHM.FINNISH; - break; - case FRENCH: - alg = SnowballStemmer.ALGORITHM.FRENCH; - break; - case GERMAN: - alg = SnowballStemmer.ALGORITHM.GERMAN; - break; - case HUNGARIAN: - alg = SnowballStemmer.ALGORITHM.HUNGARIAN; - break; - case IRISH: - alg = SnowballStemmer.ALGORITHM.IRISH; - break; - case ITALIAN: - alg = SnowballStemmer.ALGORITHM.ITALIAN; - break; - case NORWEGIAN_BOKMAL: - case NORWEGIAN_NYNORSK: - alg = SnowballStemmer.ALGORITHM.NORWEGIAN; - break; - case PORTUGUESE: - alg = SnowballStemmer.ALGORITHM.PORTUGUESE; - break; - case ROMANIAN: - alg = SnowballStemmer.ALGORITHM.ROMANIAN; - break; - case RUSSIAN: - alg = SnowballStemmer.ALGORITHM.RUSSIAN; - break; - case SPANISH: - alg = SnowballStemmer.ALGORITHM.SPANISH; - break; - case SWEDISH: - alg = SnowballStemmer.ALGORITHM.SWEDISH; - break; - case TURKISH: - alg = SnowballStemmer.ALGORITHM.TURKISH; - break; - case ENGLISH: - alg = SnowballStemmer.ALGORITHM.ENGLISH; - break; - default: - return null; - - } - return new SnowballStemmer(alg); - } - private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { - final String original = token; - log.log(Level.FINEST, () -> "processToken '"+original+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) { - final String oldToken = token; - token = doStemming(token, stemmer); - final String newToken = token; - log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'"); - } - final String result = token; - log.log(Level.FINEST, () -> "processed token is: "+result); - return result; + if (stemMode != StemMode.NONE) + token = stemmer.stem(token).toString(); + return token; + } + + private Stemmer stemmerFor(Language language, StemMode stemMode) { + if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null; + SnowballStemmer.ALGORITHM algorithm = algorithmFor(language); + if (algorithm == null) return null; + return new SnowballStemmer(algorithm); } - private String doStemming(String token, Stemmer stemmer) { - return stemmer.stem(token).toString(); + private SnowballStemmer.ALGORITHM algorithmFor(Language language) { + switch (language) { + case DANISH: return SnowballStemmer.ALGORITHM.DANISH; + case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH; + case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH; + case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH; + case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN; + case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN; + case IRISH: return SnowballStemmer.ALGORITHM.IRISH; + case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN; + case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE; + case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN; + case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN; + case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH; + case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH; + case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH; + case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH; + default: return null; + } } + } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index 389926f1c1b..e1a04b2985d 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -33,7 +33,6 @@ public class SimpleLinguistics implements Linguistics { private final GramSplitter gramSplitter; @Inject - @SuppressWarnings("deprecation") public SimpleLinguistics() { this.normalizer = new SimpleNormalizer(); this.transformer = new SimpleTransformer(); diff --git a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java index c0f1b92a6bf..f2891a0c5d5 100644 --- a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java @@ -6,6 +6,7 @@ import org.junit.Test; import java.nio.ByteBuffer; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import static org.junit.Assert.*; @@ -15,7 +16,7 @@ import static org.junit.Assert.*; public class AbstractDetectorTestCase { private static final Detection DETECTION = new Detection(Language.ARABIC, "encoding", true); - private static final Charset UTF8 = Charset.forName("UTF-8"); + private static final Charset UTF8 = StandardCharsets.UTF_8; @Test public void requireThatDetectStringForwardsUtf8Bytes() { diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index fb313e2d281..cd27551cd9a 100644 --- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -24,7 +24,7 @@ import static org.junit.Assert.fail; /** * Test of tokenization, with stemming and accent removal * - * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + * @author matskin */ public class OpenNlpTokenizationTestCase { diff --git a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java index daa3e2a4541..524f1b5b6fe 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java @@ -7,7 +7,7 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; /** - * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + * @author Mathias Mølster Lidal */ public class NormalizationTestCase { diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java index 43b4b711b2b..2d3ac291716 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java @@ -10,7 +10,7 @@ import org.junit.Test; /** * Functional testing of StemList. * - * @author steinar + * @author Steinar Knutsen */ public class StemListTestCase { diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java index 11263ccafe8..a2f51ee7367 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java @@ -11,7 +11,6 @@ import static org.junit.Assert.*; public class TokenTypeTestCase { @Test - @SuppressWarnings("deprecation") public void requireThatValueOfWorks() { for (TokenType type : TokenType.values()) { assertEquals(type, TokenType.valueOf(type.getValue())); @@ -19,7 +18,6 @@ public class TokenTypeTestCase { } @Test - @SuppressWarnings("deprecation") public void requireThatValueOfUnknownIsUnknown() { assertEquals(TokenType.UNKNOWN, TokenType.valueOf(-1)); } diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java index 041a27fb1fc..f99dc5633f5 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java @@ -22,7 +22,7 @@ import static org.junit.Assert.fail; /** * Test of tokenization, with stemming and accent removal * - * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + * @author Mathias Mølster Lidal */ public class TokenizationTestCase { @@ -54,26 +54,24 @@ public class TokenizationTestCase { public void testDoubleWidthTokenization() { // "sony" assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("sony"), null); + List.of("sony"), null); assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("sony"), null); + List.of("sony"), null); // "SONY" assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("sony"), null); + List.of("sony"), null); assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("sony"), null); + List.of("sony"), null); // "on" assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("on"), null); + List.of("on"), null); assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("on"), null); + List.of("on"), null); // "ON" assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("on"), null); + List.of("on"), null); assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("on"), null); - - + List.of("on"), null); } @Test diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java index fc69fc998a7..fe25e5fe17f 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java @@ -9,7 +9,7 @@ import static org.junit.Assert.assertEquals; /** * Check simple token types. * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public class SimpleTokenTypeTestCase { diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java index 2cebfe26dc7..4c2a8f9f591 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -6,7 +6,7 @@ import com.yahoo.language.process.StemMode; import org.junit.Test; /** - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen * @author bratseth */ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase { |