diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2021-12-18 12:05:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-18 12:05:59 +0100 |
commit | db8d449a9f8c93df16874123078c280fb346174f (patch) | |
tree | 9d96823262df3e60d5da93f697758154b1ae93b1 | |
parent | b4f5820672908823982c69260a8a5df3163aa236 (diff) |
Revert "Replace optimaize with OpenNLP language detector [run-systemtest]"
20 files changed, 204 insertions, 259 deletions
diff --git a/application/pom.xml b/application/pom.xml index af25bda0f07..61cea1a1826 100644 --- a/application/pom.xml +++ b/application/pom.xml @@ -97,6 +97,17 @@ <scope>compile</scope> </dependency> <dependency> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + <exclusions> + <exclusion> + <!-- We want to get this via jdisc-core --> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> <groupId>org.antlr</groupId> <artifactId>antlr-runtime</artifactId> </dependency> diff --git a/cloud-tenant-base-dependencies-enforcer/pom.xml b/cloud-tenant-base-dependencies-enforcer/pom.xml index b62d355480f..da50fadbd16 100644 --- a/cloud-tenant-base-dependencies-enforcer/pom.xml +++ b/cloud-tenant-base-dependencies-enforcer/pom.xml @@ -219,6 +219,7 @@ <include>com.ibm.icu:icu4j:57.1:jar:test</include> <include>com.intellij:annotations:12.0:jar:test</include> <include>com.microsoft.onnxruntime:onnxruntime:[${onnxruntime.version}]:jar:test</include> + <include>com.optimaize.languagedetector:language-detector:0.6:jar:test</include> <include>com.thaiopensource:jing:20091111:jar:test</include> <include>com.yahoo.athenz:athenz-auth-core:[${athenz.version}]:jar:test</include> <include>com.yahoo.athenz:athenz-client-common:[${athenz.version}]:jar:test</include> @@ -248,7 +249,7 @@ <include>org.apache.httpcomponents.core5:httpcore5-h2:${httpclient5.version}:jar:test</include> <include>org.apache.httpcomponents:httpclient:4.5.12:jar:test</include> <include>org.apache.httpcomponents:httpcore:4.4.13:jar:test</include> - <include>org.apache.opennlp:opennlp-tools:1.9.3:jar:test</include> + <include>org.apache.opennlp:opennlp-tools:1.8.4:jar:test</include> <include>org.apiguardian:apiguardian-api:1.1.0:jar:test</include> <include>org.codehaus.woodstox:stax2-api:3.1.4:jar:test</include> <include>org.eclipse.jetty.alpn:alpn-api:[${jetty-alpn.version}]:jar:test</include> diff --git a/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java b/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java index a38f39559f5..563c9a8bdff 100644 --- a/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java +++ b/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java @@ -18,12 +18,10 @@ import com.yahoo.language.opennlp.OpenNlpLinguistics; public class DefaultLinguisticsProvider implements Provider<Linguistics> { // Use lazy initialization to avoid expensive (memory-wise) instantiation - private final Supplier<Linguistics> linguisticsSupplier; + private final Supplier<Linguistics> linguisticsSupplier = Suppliers.memoize(OpenNlpLinguistics::new); @Inject - public DefaultLinguisticsProvider() { - linguisticsSupplier = Suppliers.memoize(OpenNlpLinguistics::new); - } + public DefaultLinguisticsProvider() { } @Override public Linguistics get() { return linguisticsSupplier.get(); } diff --git a/container-dev/pom.xml b/container-dev/pom.xml index 034081f4620..a76f295a7b1 100644 --- a/container-dev/pom.xml +++ b/container-dev/pom.xml @@ -194,6 +194,10 @@ <version>${project.version}</version> <exclusions> <exclusion> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + </exclusion> + <exclusion> <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> </exclusion> diff --git a/indexinglanguage/pom.xml b/indexinglanguage/pom.xml index 32d8068dfcd..cfc7b09a934 100644 --- a/indexinglanguage/pom.xml +++ b/indexinglanguage/pom.xml @@ -44,11 +44,6 @@ <artifactId>predicate-search-core</artifactId> <version>${project.version}</version> </dependency> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <scope>test</scope> - </dependency> </dependencies> <build> <plugins> diff --git a/linguistics/pom.xml b/linguistics/pom.xml index d9ab942a0b8..a09f2ecb031 100644 --- a/linguistics/pom.xml +++ b/linguistics/pom.xml @@ -22,7 +22,6 @@ <dependency> <groupId>org.mockito</groupId> <artifactId>mockito-core</artifactId> - <scope>test</scope> </dependency> <dependency> <groupId>com.yahoo.vespa</groupId> @@ -62,6 +61,10 @@ <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> </dependency> + <dependency> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + </dependency> </dependencies> <build> <plugins> diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java deleted file mode 100644 index aa4387bcc45..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator; -import opennlp.tools.langdetect.LanguageDetectorContextGenerator; -import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; -import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; -import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; -import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; - -/** - * Overrides the UrlCharSequenceNormalizer, which has a bad regex, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350 - * - * @author jonmv - */ -@SuppressWarnings("unused") // Loaded by black magic. -public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDetectorFactory { - - @Override - public LanguageDetectorContextGenerator getContextGenerator() { - return new DefaultLanguageDetectorContextGenerator(1, 3, - EmojiCharSequenceNormalizer.getInstance(), - UrlCharSequenceNormalizer.getInstance(), - TwitterCharSequenceNormalizer.getInstance(), - NumberCharSequenceNormalizer.getInstance(), - ShrinkCharSequenceNormalizer.getInstance()); - } - -} diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java deleted file mode 100644 index 849452aeafd..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import com.yahoo.language.Language; -import com.yahoo.language.detect.Detection; -import com.yahoo.language.detect.Detector; -import com.yahoo.language.detect.Hint; -import com.yahoo.language.simple.SimpleDetector; -import opennlp.tools.langdetect.LanguageDetectorConfig; -import opennlp.tools.langdetect.LanguageDetectorME; -import opennlp.tools.langdetect.LanguageDetectorModel; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import static java.nio.charset.StandardCharsets.UTF_8; - -/** - * Detects the language of some sample text using {@link SimpleDetector} for CJK input, and OpenNLP otherwise. - * - * @author jonmv - */ -class OpenNlpDetector implements Detector { - - private static final Object monitor = new Object(); - private static LanguageDetectorModel model; - - private final SimpleDetector simple = new SimpleDetector(); - private final Map<String, Language> languagesByISO3 = new HashMap<>(); - private final LanguageDetectorME detector; - private final LanguageDetectorConfig config; - - OpenNlpDetector() { - detector = new LanguageDetectorME(loadModel()); - config = new LanguageDetectorConfig(); - config.setMinDiff(0.02); - config.setChunkSize(64); - for (Locale locale : Locale.getAvailableLocales()) - languagesByISO3.put(locale.getISO3Language(), Language.fromLocale(locale)); - } - - private static LanguageDetectorModel loadModel() { - synchronized (monitor) { - if (model == null) { - try { - model = new LanguageDetectorModel(OpenNlpDetector.class.getResourceAsStream("/models/langdetect-183.bin")); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - return model; - } - - @Override - public Detection detect(byte[] input, int offset, int length, Hint hint) { - Charset encoding = Charset.forName(simple.guessEncoding(input, offset, length)); - return new Detection(detectLanguage(new String(input, offset, length, encoding)), encoding.name(), false); - } - - @Override - public Detection detect(ByteBuffer input, Hint hint) { - if (input.hasArray()) - return detect(input.array(), input.arrayOffset() + input.position(), input.remaining(), hint); - - byte[] buffer = new byte[input.remaining()]; - input.get(buffer); - return detect(buffer, 0, buffer.length, hint); - } - - @Override - public Detection detect(String input, Hint hint) { - return new Detection(detectLanguage(input), UTF_8.name(), false); - } - - private Language detectLanguage(String input) { - Language simpleGuess = simple.guessLanguage(input); - if (simpleGuess != Language.UNKNOWN) - return simpleGuess; - - var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0]; - return prediction.getConfidence() > 0.03 ? languagesByISO3.getOrDefault(prediction.getLang(), Language.UNKNOWN) - : Language.UNKNOWN; - } - -} diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index c749679024a..a27e726cda8 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -7,21 +7,36 @@ import com.yahoo.language.detect.Detector; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleDetector; import com.yahoo.language.simple.SimpleLinguistics; -import opennlp.tools.langdetect.LanguageDetectorModel; +import java.util.logging.Logger; +import java.util.logging.Level; /** - * Returns a linguistics implementation based on OpenNlp. + * Returns a linguistics implementation based on OpenNlp, + * and (optionally, default on) Optimaize for language detection. * * @author bratseth - * @author jonmv */ public class OpenNlpLinguistics extends SimpleLinguistics { + private static final Logger log = Logger.getLogger(OpenNlpLinguistics.class.getName()); private final Detector detector; - @Inject public OpenNlpLinguistics() { - this.detector = new OpenNlpDetector(); + this(true); + } + + @Inject + public OpenNlpLinguistics(OpennlpLinguisticsConfig config) { + this(config.detector().enableOptimaize()); + } + + public OpenNlpLinguistics(boolean enableOptimaize) { + this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector()); + log.log(Level.FINE, "using "+(enableOptimaize ? "Optimaize" : "Simple")+" detector"); + } + + private OpenNlpLinguistics(Detector detector) { + this.detector = detector; } @Override diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java new file mode 100644 index 00000000000..83947c795fb --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java @@ -0,0 +1,107 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObjectFactory; +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; +import com.yahoo.text.Utf8; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; +import java.util.logging.Logger; +import java.util.logging.Level; + +/** + * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise. + * + * @author bratseth + */ +public class OptimaizeDetector implements Detector { + + private static final Object initGuard = new Object(); + private static TextObjectFactory textObjectFactory = null; + private static LanguageDetector languageDetector = null; + private static final Logger log = Logger.getLogger(OptimaizeDetector.class.getName()); + + static private void initOptimaize() { + synchronized (initGuard) { + if ((textObjectFactory != null) && (languageDetector != null)) return; + + // origin: https://github.com/optimaize/language-detector + // load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + } + + private final SimpleDetector simpleDetector = new SimpleDetector(); + + public OptimaizeDetector() { + initOptimaize(); + } + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + return new Detection(guessLanguage(input, offset, length), simpleDetector.guessEncoding(input), false); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + byte[] buf = new byte[input.remaining()]; + input.get(buf, 0, buf.length); + return detect(buf, 0, buf.length, hint); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(guessLanguage(input), Utf8.getCharset().name(), false); + } + + private Language guessLanguage(byte[] buf, int offset, int length) { + return guessLanguage(Utf8.toString(buf, offset, length)); + } + + public Language guessLanguage(String input) { + if (input == null || input.length() == 0) return Language.UNKNOWN; + + Language result = simpleDetector.guessLanguage(input); + if (result != Language.UNKNOWN) return result; + + return guessLanguageUsingOptimaize(input); + } + + private static Language guessLanguageUsingOptimaize(String input) { + Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input)); + if ( ! result.isPresent()) return Language.UNKNOWN; + log.log(Level.FINE, () -> "guessing language "+result.get()+" from input: "+input); + + return Language.fromLocale(new Locale(result.get().getLanguage())); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java deleted file mode 100644 index 883319e2f8b..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import opennlp.tools.util.normalizer.CharSequenceNormalizer; - -import java.util.regex.Pattern; - -/** - * Modifies {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer} to avoid the bad email regex. - * - * @author jonmv - */ -public class UrlCharSequenceNormalizer implements CharSequenceNormalizer { - - private static final Pattern URL_REGEX = - Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+"); - private static final Pattern MAIL_REGEX = - Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+"); - - private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer(); - - public static UrlCharSequenceNormalizer getInstance() { - return INSTANCE; - } - - public CharSequence normalize(CharSequence text) { - String modified = URL_REGEX.matcher(text).replaceAll(" "); - return MAIL_REGEX.matcher(modified).replaceAll(" "); - } - -} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index 61d446cd8d0..53b8ad7ad70 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -130,14 +130,10 @@ public class SimpleDetector implements Detector { } public String guessEncoding(byte[] input) { - return guessEncoding(input, 0, input.length); - } - - public String guessEncoding(byte[] input, int offset, int length) { boolean isUtf8 = true; boolean hasHighs = false; scan: - for (int i = offset; i < offset + length; i++) { + for (int i = 0; i < input.length; i++) { final int l = isLeadingFor(input[i]); if (l < 0 || i + l >= input.length) { hasHighs = true; diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index b10beb8c9af..3ca46dcc4f1 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -2,7 +2,8 @@ package com.yahoo.language.simple; import com.google.inject.Inject; -import com.yahoo.component.AbstractComponent; +import com.yahoo.collections.Tuple2; +import com.yahoo.component.Version; import com.yahoo.language.Linguistics; import com.yahoo.language.detect.Detector; import com.yahoo.language.process.CharacterClasses; @@ -15,6 +16,7 @@ import com.yahoo.language.process.Stemmer; import com.yahoo.language.process.StemmerImpl; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.process.Transformer; +import com.yahoo.vespa.configdefinition.SpecialtokensConfig; import java.util.List; diff --git a/linguistics/src/main/resources/configdefinitions/language.opennlp.opennlp-linguistics.def b/linguistics/src/main/resources/configdefinitions/language.opennlp.opennlp-linguistics.def new file mode 100644 index 00000000000..361a8a5f50c --- /dev/null +++ b/linguistics/src/main/resources/configdefinitions/language.opennlp.opennlp-linguistics.def @@ -0,0 +1,6 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +namespace=language.opennlp + +# Enable Optimaize language detector +detector.enableOptimaize bool default=true + diff --git a/linguistics/src/main/resources/models/langdetect-183.bin b/linguistics/src/main/resources/models/langdetect-183.bin Binary files differdeleted file mode 100644 index c3cde217050..00000000000 --- a/linguistics/src/main/resources/models/langdetect-183.bin +++ /dev/null diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpDetectorTestCase.java deleted file mode 100644 index aaa6b2a6484..00000000000 --- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpDetectorTestCase.java +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import com.yahoo.language.Language; -import com.yahoo.language.detect.Detector; -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -/** - * @author jonmv - */ -public class OpenNlpDetectorTestCase { - - @Test - public void testDetection() { - Detector detector = new OpenNlpDetector(); - - assertLanguage(Language.UNKNOWN, - "", - detector); - - assertLanguage(Language.UNKNOWN, - "Hello!", - detector); - - // from https://en.wikipedia.org/wiki/Yahoo - assertLanguage(Language.ENGLISH, - "Yahoo became a public company via an initial public offering in April 1996 and its stock price rose 600% within two years.", - detector); - - // from https://de.wikipedia.org/wiki/Yahoo - assertLanguage(Language.GERMAN, - "1996 ging Yahoo mit 46 Angestellten an die Börse. 2009 arbeiteten insgesamt rund 13.500 Mitarbeiter für Yahoo.", - detector); - - // from https://fr.wikipedia.org/wiki/Yahoo - assertLanguage(Language.FRENCH, - "À l'origine, Yahoo! était uniquement un annuaire Web.", - detector); - - // Test fallback to SimpleDetector - assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input - "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002", - detector); - - // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F - assertLanguage(Language.RUSSIAN, - "7 февраля 2000 года Yahoo.com подвергся DDoS атаке и на несколько часов приостановил работу.", - detector); - - // https://he.wikipedia.org/wiki/Yahoo! - assertLanguage(Language.HEBREW, - "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום", - detector); - } - - private void assertLanguage(Language language, String input, Detector detector) { - assertEquals(language, detector.detect(input, null).getLanguage()); - } - -} diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java new file mode 100644 index 00000000000..20b5de3b165 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java @@ -0,0 +1,35 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detector; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class OptimaizeDetectorTestCase { + + private static final Detector detector = new OptimaizeDetector(); + + @Test + public void testDetection() { + assertLanguage(Language.UNKNOWN, "Hello!"); + + // Test fallback to SimpleDetector + assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input + "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002"); + + // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F + assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии"); + // https://he.wikipedia.org/wiki/Yahoo! + assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום"); + } + + private static void assertLanguage(Language language, String input) { + assertEquals(language, detector.detect(input, null).getLanguage()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizerTest.java b/linguistics/src/test/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizerTest.java deleted file mode 100644 index a8c637bc6ec..00000000000 --- a/linguistics/src/test/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizerTest.java +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -/** - * @author jonmv - */ -public class UrlCharSequenceNormalizerTest { - - @Test - public void testNormalization() { - String text = "xxx+yyy_.dude@mail.com foo bar@baz_bax https://host.tld/path?query=boo a@b §boo@boo"; - assertEquals(" foo _bax a@b § ", - UrlCharSequenceNormalizer.getInstance().normalize(text)); - } - -} diff --git a/parent/pom.xml b/parent/pom.xml index b1dcaab2d83..2cebaf21833 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -496,6 +496,11 @@ <version>${onnxruntime.version}</version> </dependency> <dependency> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + <version>0.6</version> + </dependency> + <dependency> <groupId>com.yahoo.athenz</groupId> <artifactId>athenz-zms-java-client</artifactId> <version>${athenz.version}</version> @@ -699,7 +704,7 @@ <dependency> <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> - <version>1.9.3</version> + <version>1.8.4</version> </dependency> <dependency> <groupId>org.apache.velocity</groupId> diff --git a/vespajlib/src/main/java/com/yahoo/tensor/TensorType.java b/vespajlib/src/main/java/com/yahoo/tensor/TensorType.java index dfbcb06c365..5b23d5d92ae 100644 --- a/vespajlib/src/main/java/com/yahoo/tensor/TensorType.java +++ b/vespajlib/src/main/java/com/yahoo/tensor/TensorType.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.tensor; +import com.google.common.collect.ImmutableList; import com.yahoo.text.Ascii7BitMatcher; import java.util.ArrayList; @@ -85,7 +86,7 @@ public class TensorType { private final Value valueType; /** Sorted list of the dimensions of this */ - private final List<Dimension> dimensions; + private final ImmutableList<Dimension> dimensions; private final TensorType mappedSubtype; @@ -93,7 +94,7 @@ public class TensorType { this.valueType = valueType; List<Dimension> dimensionList = new ArrayList<>(dimensions); Collections.sort(dimensionList); - this.dimensions = List.copyOf(dimensionList); + this.dimensions = ImmutableList.copyOf(dimensionList); if (dimensionList.stream().allMatch(d -> d.isIndexed())) mappedSubtype = empty; |