diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-14 10:08:30 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-04-14 10:08:30 +0200 |
commit | 9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (patch) | |
tree | c270c9ba65a121a87deb877510ba527729f20876 /linguistics/src/main/java/com/yahoo/language | |
parent | fd9b726786f4c00b276f2d84fd0a3593a0c406eb (diff) |
No functional changes
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language')
5 files changed, 68 insertions, 109 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java index 7a3f5fa4055..174d16fbd67 100644 --- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java +++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java @@ -14,13 +14,14 @@ import java.util.Locale; public class LinguisticsCase { /** - * <p>The lower casing method to use in Vespa when doing language independent processing of natural language data. - * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.</p> - * <p>Return a lowercased version of the given string. Since this is language independent, this is more of a case - * normalization operation than lowercasing.</p> + * The lower casing method to use in Vespa when doing language independent processing of natural language data. + * It is placed in a single place to ensure symmetry between e.g. query processing and indexing. * - * @param in The string to lowercase. - * @return A string containing only lowercase character. + * Return a lowercased version of the given string. Since this is language independent, this is more of a case + * normalization operation than lowercasing. + * + * @param in the string to lowercase + * @return a string containing only lowercase characters */ public static String toLowerCase(String in) { // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29 diff --git a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java index 2760f9e673e..05b57937625 100644 --- a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java +++ b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java @@ -2,6 +2,7 @@ package com.yahoo.language; import java.util.Locale; +import java.util.Objects; /** * @author Simon Thoresen Hult @@ -10,25 +11,20 @@ public final class LocaleFactory { private static final Locale UNKNOWN = new Locale("", "", ""); - private LocaleFactory() { - // hide - } + private LocaleFactory() {} /** * Implements a simple parser for RFC5646 language tags. The language tag is parsed into a Locale. * - * @param tag The language tag to parse. - * @return The corresponding Locale. + * @param tag the language tag to parse + * @return the corresponding Locale */ - @SuppressWarnings("ConstantConditions") public static Locale fromLanguageTag(String tag) { - // TODO: Should be replaced by return Locale.forLanguageTag(tag); ? + Objects.requireNonNull(tag, "tag cannot be null"); - tag.getClass(); // throws NullPointerException tag = tag.trim(); - if (tag.isEmpty()) { - return UNKNOWN; - } + if (tag.isEmpty()) return UNKNOWN; + String language = ""; String region = ""; String script = ""; @@ -48,9 +44,7 @@ public final class LocaleFactory { } } } - if (language.isEmpty()) { - return UNKNOWN; - } + if (language.isEmpty()) return UNKNOWN; return new Locale(language, region, script); } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 0837b25c151..a5f77fca0af 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -12,6 +12,8 @@ import java.util.logging.Level; /** * Returns a linguistics implementation based on OpenNlp, * and (optionally, default on) Optimaize for language detection. + * + * @author bratseth */ public class OpenNlpLinguistics extends SimpleLinguistics { diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 93599fa7dbe..e1185cb2457 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -3,21 +3,32 @@ package com.yahoo.language.opennlp; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; -import com.yahoo.language.process.*; -import com.yahoo.language.simple.*; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleToken; +import com.yahoo.language.simple.SimpleTokenType; +import com.yahoo.language.simple.SimpleTokenizer; +import com.yahoo.language.simple.SimpleTransformer; import opennlp.tools.stemmer.Stemmer; import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.logging.Logger; -import java.util.logging.Level; +/** + * Tokenizer using OpenNlp + * + * @author matskin + */ public class OpenNlpTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; - private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName()); private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -35,10 +46,8 @@ public class OpenNlpTokenizer implements Tokenizer { @Override public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { if (input.isEmpty()) return Collections.emptyList(); - Stemmer stemmer = getStemmerForLanguage(language, stemMode); - if (stemmer == null) { - return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); - } + Stemmer stemmer = stemmerFor(language, stemMode); + if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); List<Token> tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); @@ -49,9 +58,7 @@ public class OpenNlpTokenizer implements Tokenizer { if (!prevType.isIndexable() || !nextType.isIndexable()) { String original = input.substring(prev, next); String token = processToken(original, language, stemMode, removeAccents, stemmer); - tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token)); prev = next; prevType = nextType; } @@ -60,89 +67,45 @@ public class OpenNlpTokenizer implements Tokenizer { return tokens; } - private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { - log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode); - if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) { - return null; - } - SnowballStemmer.ALGORITHM alg; - switch (language) { - case DANISH: - alg = SnowballStemmer.ALGORITHM.DANISH; - break; - case DUTCH: - alg = SnowballStemmer.ALGORITHM.DUTCH; - break; - case FINNISH: - alg = SnowballStemmer.ALGORITHM.FINNISH; - break; - case FRENCH: - alg = SnowballStemmer.ALGORITHM.FRENCH; - break; - case GERMAN: - alg = SnowballStemmer.ALGORITHM.GERMAN; - break; - case HUNGARIAN: - alg = SnowballStemmer.ALGORITHM.HUNGARIAN; - break; - case IRISH: - alg = SnowballStemmer.ALGORITHM.IRISH; - break; - case ITALIAN: - alg = SnowballStemmer.ALGORITHM.ITALIAN; - break; - case NORWEGIAN_BOKMAL: - case NORWEGIAN_NYNORSK: - alg = SnowballStemmer.ALGORITHM.NORWEGIAN; - break; - case PORTUGUESE: - alg = SnowballStemmer.ALGORITHM.PORTUGUESE; - break; - case ROMANIAN: - alg = SnowballStemmer.ALGORITHM.ROMANIAN; - break; - case RUSSIAN: - alg = SnowballStemmer.ALGORITHM.RUSSIAN; - break; - case SPANISH: - alg = SnowballStemmer.ALGORITHM.SPANISH; - break; - case SWEDISH: - alg = SnowballStemmer.ALGORITHM.SWEDISH; - break; - case TURKISH: - alg = SnowballStemmer.ALGORITHM.TURKISH; - break; - case ENGLISH: - alg = SnowballStemmer.ALGORITHM.ENGLISH; - break; - default: - return null; - - } - return new SnowballStemmer(alg); - } - private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { - final String original = token; - log.log(Level.FINEST, () -> "processToken '"+original+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) { - final String oldToken = token; - token = doStemming(token, stemmer); - final String newToken = token; - log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'"); - } - final String result = token; - log.log(Level.FINEST, () -> "processed token is: "+result); - return result; + if (stemMode != StemMode.NONE) + token = stemmer.stem(token).toString(); + return token; + } + + private Stemmer stemmerFor(Language language, StemMode stemMode) { + if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null; + SnowballStemmer.ALGORITHM algorithm = algorithmFor(language); + if (algorithm == null) return null; + return new SnowballStemmer(algorithm); } - private String doStemming(String token, Stemmer stemmer) { - return stemmer.stem(token).toString(); + private SnowballStemmer.ALGORITHM algorithmFor(Language language) { + switch (language) { + case DANISH: return SnowballStemmer.ALGORITHM.DANISH; + case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH; + case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH; + case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH; + case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN; + case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN; + case IRISH: return SnowballStemmer.ALGORITHM.IRISH; + case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN; + case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE; + case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN; + case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN; + case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH; + case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH; + case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH; + case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH; + default: return null; + } } + } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index 389926f1c1b..e1a04b2985d 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -33,7 +33,6 @@ public class SimpleLinguistics implements Linguistics { private final GramSplitter gramSplitter; @Inject - @SuppressWarnings("deprecation") public SimpleLinguistics() { this.normalizer = new SimpleNormalizer(); this.transformer = new SimpleTransformer(); |