about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo/language
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:08:30 +0200
committer Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:08:30 +0200
commit 9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (patch)
tree c270c9ba65a121a87deb877510ba527729f20876 /linguistics/src/main/java/com/yahoo/language
parent fd9b726786f4c00b276f2d84fd0a3593a0c406eb (diff)
No functional changes
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java 13
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LocaleFactory.java 22
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 139
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java 1
5 files changed, 68 insertions, 109 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index 7a3f5fa4055..174d16fbd67 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -14,13 +14,14 @@ import java.util.Locale;
public class LinguisticsCase {
/**
- * <p>The lower casing method to use in Vespa when doing language independent processing of natural language data.
- * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.</p>
- * <p>Return a lowercased version of the given string. Since this is language independent, this is more of a case
- * normalization operation than lowercasing.</p>
+ * The lower casing method to use in Vespa when doing language independent processing of natural language data.
+ * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.
*
- * @param in The string to lowercase.
- * @return A string containing only lowercase character.
+ * Return a lowercased version of the given string. Since this is language independent, this is more of a case
+ * normalization operation than lowercasing.
+ *
+ * @param in the string to lowercase
+ * @return a string containing only lowercase characters
*/
public static String toLowerCase(String in) {
// def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
diff --git a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
index 2760f9e673e..05b57937625 100644
--- a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
+++ b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
@@ -2,6 +2,7 @@
package com.yahoo.language;
import java.util.Locale;
+import java.util.Objects;
/**
* @author Simon Thoresen Hult
@@ -10,25 +11,20 @@ public final class LocaleFactory {
private static final Locale UNKNOWN = new Locale("", "", "");
- private LocaleFactory() {
- // hide
- }
+ private LocaleFactory() {}
/**
* Implements a simple parser for RFC5646 language tags. The language tag is parsed into a Locale.
*
- * @param tag The language tag to parse.
- * @return The corresponding Locale.
+ * @param tag the language tag to parse
+ * @return the corresponding Locale
*/
- @SuppressWarnings("ConstantConditions")
public static Locale fromLanguageTag(String tag) {
- // TODO: Should be replaced by return Locale.forLanguageTag(tag); ?
+ Objects.requireNonNull(tag, "tag cannot be null");
- tag.getClass(); // throws NullPointerException
tag = tag.trim();
- if (tag.isEmpty()) {
- return UNKNOWN;
- }
+ if (tag.isEmpty()) return UNKNOWN;
+
String language = "";
String region = "";
String script = "";
@@ -48,9 +44,7 @@ public final class LocaleFactory {
}
}
}
- if (language.isEmpty()) {
- return UNKNOWN;
- }
+ if (language.isEmpty()) return UNKNOWN;
return new Locale(language, region, script);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 0837b25c151..a5f77fca0af 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -12,6 +12,8 @@ import java.util.logging.Level;
/**
* Returns a linguistics implementation based on OpenNlp,
* and (optionally, default on) Optimaize for language detection.
+ *
+ * @author bratseth
*/
public class OpenNlpLinguistics extends SimpleLinguistics {
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 93599fa7dbe..e1185cb2457 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -3,21 +3,32 @@ package com.yahoo.language.opennlp;
import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
-import com.yahoo.language.process.*;
-import com.yahoo.language.simple.*;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
+import com.yahoo.language.simple.SimpleTokenType;
+import com.yahoo.language.simple.SimpleTokenizer;
+import com.yahoo.language.simple.SimpleTransformer;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-import java.util.logging.Logger;
-import java.util.logging.Level;
+/**
+ * Tokenizer using OpenNlp
+ *
+ * @author matskin
+ */
public class OpenNlpTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
- private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName());
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -35,10 +46,8 @@ public class OpenNlpTokenizer implements Tokenizer {
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
if (input.isEmpty()) return Collections.emptyList();
- Stemmer stemmer = getStemmerForLanguage(language, stemMode);
- if (stemmer == null) {
- return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
- }
+ Stemmer stemmer = stemmerFor(language, stemMode);
+ if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
@@ -49,9 +58,7 @@ public class OpenNlpTokenizer implements Tokenizer {
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
String token = processToken(original, language, stemMode, removeAccents, stemmer);
- tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token));
prev = next;
prevType = nextType;
}
@@ -60,89 +67,45 @@ public class OpenNlpTokenizer implements Tokenizer {
return tokens;
}
- private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
- log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode);
- if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
- return null;
- }
- SnowballStemmer.ALGORITHM alg;
- switch (language) {
- case DANISH:
- alg = SnowballStemmer.ALGORITHM.DANISH;
- break;
- case DUTCH:
- alg = SnowballStemmer.ALGORITHM.DUTCH;
- break;
- case FINNISH:
- alg = SnowballStemmer.ALGORITHM.FINNISH;
- break;
- case FRENCH:
- alg = SnowballStemmer.ALGORITHM.FRENCH;
- break;
- case GERMAN:
- alg = SnowballStemmer.ALGORITHM.GERMAN;
- break;
- case HUNGARIAN:
- alg = SnowballStemmer.ALGORITHM.HUNGARIAN;
- break;
- case IRISH:
- alg = SnowballStemmer.ALGORITHM.IRISH;
- break;
- case ITALIAN:
- alg = SnowballStemmer.ALGORITHM.ITALIAN;
- break;
- case NORWEGIAN_BOKMAL:
- case NORWEGIAN_NYNORSK:
- alg = SnowballStemmer.ALGORITHM.NORWEGIAN;
- break;
- case PORTUGUESE:
- alg = SnowballStemmer.ALGORITHM.PORTUGUESE;
- break;
- case ROMANIAN:
- alg = SnowballStemmer.ALGORITHM.ROMANIAN;
- break;
- case RUSSIAN:
- alg = SnowballStemmer.ALGORITHM.RUSSIAN;
- break;
- case SPANISH:
- alg = SnowballStemmer.ALGORITHM.SPANISH;
- break;
- case SWEDISH:
- alg = SnowballStemmer.ALGORITHM.SWEDISH;
- break;
- case TURKISH:
- alg = SnowballStemmer.ALGORITHM.TURKISH;
- break;
- case ENGLISH:
- alg = SnowballStemmer.ALGORITHM.ENGLISH;
- break;
- default:
- return null;
-
- }
- return new SnowballStemmer(alg);
- }
-
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
Stemmer stemmer) {
- final String original = token;
- log.log(Level.FINEST, () -> "processToken '"+original+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE) {
- final String oldToken = token;
- token = doStemming(token, stemmer);
- final String newToken = token;
- log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'");
- }
- final String result = token;
- log.log(Level.FINEST, () -> "processed token is: "+result);
- return result;
+ if (stemMode != StemMode.NONE)
+ token = stemmer.stem(token).toString();
+ return token;
+ }
+
+ private Stemmer stemmerFor(Language language, StemMode stemMode) {
+ if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null;
+ SnowballStemmer.ALGORITHM algorithm = algorithmFor(language);
+ if (algorithm == null) return null;
+ return new SnowballStemmer(algorithm);
}
- private String doStemming(String token, Stemmer stemmer) {
- return stemmer.stem(token).toString();
+ private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
+ switch (language) {
+ case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
+ default: return null;
+ }
}
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 389926f1c1b..e1a04b2985d 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -33,7 +33,6 @@ public class SimpleLinguistics implements Linguistics {
private final GramSplitter gramSplitter;
@Inject
- @SuppressWarnings("deprecation")
public SimpleLinguistics() {
this.normalizer = new SimpleNormalizer();
this.transformer = new SimpleTransformer();