summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:08:30 +0200
committer Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:08:30 +0200
commit 9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (patch)
tree c270c9ba65a121a87deb877510ba527729f20876 /linguistics
parent fd9b726786f4c00b276f2d84fd0a3593a0c406eb (diff)
No functional changes
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java 13
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LocaleFactory.java 22
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 139
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java 1
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java 3
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java 2
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java 2
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java 2
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java 2
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java 20
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java 2
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java 2
13 files changed, 84 insertions, 128 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index 7a3f5fa4055..174d16fbd67 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -14,13 +14,14 @@ import java.util.Locale;
public class LinguisticsCase {
/**
- * <p>The lower casing method to use in Vespa when doing language independent processing of natural language data.
- * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.</p>
- * <p>Return a lowercased version of the given string. Since this is language independent, this is more of a case
- * normalization operation than lowercasing.</p>
+ * The lower casing method to use in Vespa when doing language independent processing of natural language data.
+ * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.
*
- * @param in The string to lowercase.
- * @return A string containing only lowercase character.
+ * Return a lowercased version of the given string. Since this is language independent, this is more of a case
+ * normalization operation than lowercasing.
+ *
+ * @param in the string to lowercase
+ * @return a string containing only lowercase characters
*/
public static String toLowerCase(String in) {
// def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
diff --git a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
index 2760f9e673e..05b57937625 100644
--- a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
+++ b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
@@ -2,6 +2,7 @@
package com.yahoo.language;
import java.util.Locale;
+import java.util.Objects;
/**
* @author Simon Thoresen Hult
@@ -10,25 +11,20 @@ public final class LocaleFactory {
private static final Locale UNKNOWN = new Locale("", "", "");
- private LocaleFactory() {
- // hide
- }
+ private LocaleFactory() {}
/**
* Implements a simple parser for RFC5646 language tags. The language tag is parsed into a Locale.
*
- * @param tag The language tag to parse.
- * @return The corresponding Locale.
+ * @param tag the language tag to parse
+ * @return the corresponding Locale
*/
- @SuppressWarnings("ConstantConditions")
public static Locale fromLanguageTag(String tag) {
- // TODO: Should be replaced by return Locale.forLanguageTag(tag); ?
+ Objects.requireNonNull(tag, "tag cannot be null");
- tag.getClass(); // throws NullPointerException
tag = tag.trim();
- if (tag.isEmpty()) {
- return UNKNOWN;
- }
+ if (tag.isEmpty()) return UNKNOWN;
+
String language = "";
String region = "";
String script = "";
@@ -48,9 +44,7 @@ public final class LocaleFactory {
}
}
}
- if (language.isEmpty()) {
- return UNKNOWN;
- }
+ if (language.isEmpty()) return UNKNOWN;
return new Locale(language, region, script);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 0837b25c151..a5f77fca0af 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -12,6 +12,8 @@ import java.util.logging.Level;
/**
* Returns a linguistics implementation based on OpenNlp,
* and (optionally, default on) Optimaize for language detection.
+ *
+ * @author bratseth
*/
public class OpenNlpLinguistics extends SimpleLinguistics {
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 93599fa7dbe..e1185cb2457 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -3,21 +3,32 @@ package com.yahoo.language.opennlp;
import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
-import com.yahoo.language.process.*;
-import com.yahoo.language.simple.*;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
+import com.yahoo.language.simple.SimpleTokenType;
+import com.yahoo.language.simple.SimpleTokenizer;
+import com.yahoo.language.simple.SimpleTransformer;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-import java.util.logging.Logger;
-import java.util.logging.Level;
+/**
+ * Tokenizer using OpenNlp
+ *
+ * @author matskin
+ */
public class OpenNlpTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
- private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName());
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -35,10 +46,8 @@ public class OpenNlpTokenizer implements Tokenizer {
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
if (input.isEmpty()) return Collections.emptyList();
- Stemmer stemmer = getStemmerForLanguage(language, stemMode);
- if (stemmer == null) {
- return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
- }
+ Stemmer stemmer = stemmerFor(language, stemMode);
+ if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
@@ -49,9 +58,7 @@ public class OpenNlpTokenizer implements Tokenizer {
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
String token = processToken(original, language, stemMode, removeAccents, stemmer);
- tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token));
prev = next;
prevType = nextType;
}
@@ -60,89 +67,45 @@ public class OpenNlpTokenizer implements Tokenizer {
return tokens;
}
- private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
- log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode);
- if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
- return null;
- }
- SnowballStemmer.ALGORITHM alg;
- switch (language) {
- case DANISH:
- alg = SnowballStemmer.ALGORITHM.DANISH;
- break;
- case DUTCH:
- alg = SnowballStemmer.ALGORITHM.DUTCH;
- break;
- case FINNISH:
- alg = SnowballStemmer.ALGORITHM.FINNISH;
- break;
- case FRENCH:
- alg = SnowballStemmer.ALGORITHM.FRENCH;
- break;
- case GERMAN:
- alg = SnowballStemmer.ALGORITHM.GERMAN;
- break;
- case HUNGARIAN:
- alg = SnowballStemmer.ALGORITHM.HUNGARIAN;
- break;
- case IRISH:
- alg = SnowballStemmer.ALGORITHM.IRISH;
- break;
- case ITALIAN:
- alg = SnowballStemmer.ALGORITHM.ITALIAN;
- break;
- case NORWEGIAN_BOKMAL:
- case NORWEGIAN_NYNORSK:
- alg = SnowballStemmer.ALGORITHM.NORWEGIAN;
- break;
- case PORTUGUESE:
- alg = SnowballStemmer.ALGORITHM.PORTUGUESE;
- break;
- case ROMANIAN:
- alg = SnowballStemmer.ALGORITHM.ROMANIAN;
- break;
- case RUSSIAN:
- alg = SnowballStemmer.ALGORITHM.RUSSIAN;
- break;
- case SPANISH:
- alg = SnowballStemmer.ALGORITHM.SPANISH;
- break;
- case SWEDISH:
- alg = SnowballStemmer.ALGORITHM.SWEDISH;
- break;
- case TURKISH:
- alg = SnowballStemmer.ALGORITHM.TURKISH;
- break;
- case ENGLISH:
- alg = SnowballStemmer.ALGORITHM.ENGLISH;
- break;
- default:
- return null;
-
- }
- return new SnowballStemmer(alg);
- }
-
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
Stemmer stemmer) {
- final String original = token;
- log.log(Level.FINEST, () -> "processToken '"+original+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE) {
- final String oldToken = token;
- token = doStemming(token, stemmer);
- final String newToken = token;
- log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'");
- }
- final String result = token;
- log.log(Level.FINEST, () -> "processed token is: "+result);
- return result;
+ if (stemMode != StemMode.NONE)
+ token = stemmer.stem(token).toString();
+ return token;
+ }
+
+ private Stemmer stemmerFor(Language language, StemMode stemMode) {
+ if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null;
+ SnowballStemmer.ALGORITHM algorithm = algorithmFor(language);
+ if (algorithm == null) return null;
+ return new SnowballStemmer(algorithm);
}
- private String doStemming(String token, Stemmer stemmer) {
- return stemmer.stem(token).toString();
+ private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
+ switch (language) {
+ case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
+ default: return null;
+ }
}
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 389926f1c1b..e1a04b2985d 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -33,7 +33,6 @@ public class SimpleLinguistics implements Linguistics {
private final GramSplitter gramSplitter;
@Inject
- @SuppressWarnings("deprecation")
public SimpleLinguistics() {
this.normalizer = new SimpleNormalizer();
this.transformer = new SimpleTransformer();
diff --git a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java
index c0f1b92a6bf..f2891a0c5d5 100644
--- a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java
@@ -6,6 +6,7 @@ import org.junit.Test;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import static org.junit.Assert.*;
@@ -15,7 +16,7 @@ import static org.junit.Assert.*;
public class AbstractDetectorTestCase {
private static final Detection DETECTION = new Detection(Language.ARABIC, "encoding", true);
- private static final Charset UTF8 = Charset.forName("UTF-8");
+ private static final Charset UTF8 = StandardCharsets.UTF_8;
@Test
public void requireThatDetectStringForwardsUtf8Bytes() {
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index fb313e2d281..cd27551cd9a 100644
--- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -24,7 +24,7 @@ import static org.junit.Assert.fail;
/**
* Test of tokenization, with stemming and accent removal
*
- * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ * @author matskin
*/
public class OpenNlpTokenizationTestCase {
diff --git a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
index daa3e2a4541..524f1b5b6fe 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
@@ -7,7 +7,7 @@ import org.junit.Test;
import static org.junit.Assert.assertEquals;
/**
- * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ * @author Mathias Mølster Lidal
*/
public class NormalizationTestCase {
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
index 43b4b711b2b..2d3ac291716 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
@@ -10,7 +10,7 @@ import org.junit.Test;
/**
* Functional testing of StemList.
*
- * @author steinar
+ * @author Steinar Knutsen
*/
public class StemListTestCase {
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
index 11263ccafe8..a2f51ee7367 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
@@ -11,7 +11,6 @@ import static org.junit.Assert.*;
public class TokenTypeTestCase {
@Test
- @SuppressWarnings("deprecation")
public void requireThatValueOfWorks() {
for (TokenType type : TokenType.values()) {
assertEquals(type, TokenType.valueOf(type.getValue()));
@@ -19,7 +18,6 @@ public class TokenTypeTestCase {
}
@Test
- @SuppressWarnings("deprecation")
public void requireThatValueOfUnknownIsUnknown() {
assertEquals(TokenType.UNKNOWN, TokenType.valueOf(-1));
}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
index 041a27fb1fc..f99dc5633f5 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
@@ -22,7 +22,7 @@ import static org.junit.Assert.fail;
/**
* Test of tokenization, with stemming and accent removal
*
- * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ * @author Mathias Mølster Lidal
*/
public class TokenizationTestCase {
@@ -54,26 +54,24 @@ public class TokenizationTestCase {
public void testDoubleWidthTokenization() {
// "sony"
assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
// "SONY"
assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
// "on"
assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
// "ON"
assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("on"), null);
-
-
+ List.of("on"), null);
}
@Test
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java
index fc69fc998a7..fe25e5fe17f 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java
@@ -9,7 +9,7 @@ import static org.junit.Assert.assertEquals;
/**
* Check simple token types.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
public class SimpleTokenTypeTestCase {
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index 2cebfe26dc7..4c2a8f9f591 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -6,7 +6,7 @@ import com.yahoo.language.process.StemMode;
import org.junit.Test;
/**
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
* @author bratseth
*/
public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {