diff options
15 files changed, 23 insertions, 217 deletions
diff --git a/cloud-tenant-base-dependencies-enforcer/pom.xml b/cloud-tenant-base-dependencies-enforcer/pom.xml index 491a3a09e6e..b62d355480f 100644 --- a/cloud-tenant-base-dependencies-enforcer/pom.xml +++ b/cloud-tenant-base-dependencies-enforcer/pom.xml @@ -193,7 +193,6 @@ <include>com.yahoo.vespa:hosted-api:*:jar:test</include> <include>com.yahoo.vespa:indexinglanguage:*:jar:test</include> <include>com.yahoo.vespa:jdisc_jetty:*:jar:test</include> - <include>com.yahoo.vespa:linguistics-components:*:jar:test</include> <include>com.yahoo.vespa:logd:*:jar:test</include> <include>com.yahoo.vespa:metrics-proxy:*:jar:test</include> <include>com.yahoo.vespa:metrics:*:jar:test</include> diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java index 8396aab022e..e8be43fdc96 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java @@ -105,7 +105,6 @@ public final class ApplicationContainerCluster extends ContainerCluster<Applicat addSimpleComponent("com.yahoo.language.provider.DefaultLinguisticsProvider"); addSimpleComponent("com.yahoo.language.provider.DefaultEmbedderProvider"); - addSimpleComponent("com.yahoo.language.opennlp.LangDetectModel183"); addSimpleComponent("com.yahoo.container.jdisc.SecretStoreProvider"); addSimpleComponent("com.yahoo.container.jdisc.DeprecatedSecretStoreProvider"); addSimpleComponent("com.yahoo.container.jdisc.CertificateStoreProvider"); diff --git a/config-model/src/test/java/com/yahoo/vespa/model/container/ContainerClusterTest.java b/config-model/src/test/java/com/yahoo/vespa/model/container/ContainerClusterTest.java index 94344172a1f..560ac28b6f7 100755 --- a/config-model/src/test/java/com/yahoo/vespa/model/container/ContainerClusterTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/container/ContainerClusterTest.java @@ -205,9 +205,7 @@ public class ContainerClusterTest { MockRoot root = createRoot(false); ClusterControllerContainerCluster cluster = createClusterControllerCluster(root); addClusterController(root.deployLogger(), cluster, "host-c1", root.getDeployState()); - assertFalse(contains("com.yahoo.language.provider.DefaultLinguisticsProvider", cluster.getAllComponents())); - assertFalse(contains("com.yahoo.language.opennlp.LangDetectModel183", cluster.getAllComponents())); } private static boolean contains(String componentId, Collection<Component<?, ?>> componentList) { diff --git a/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java b/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java index 11ed2157401..a38f39559f5 100644 --- a/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java +++ b/container-core/src/main/java/com/yahoo/language/provider/DefaultLinguisticsProvider.java @@ -6,7 +6,6 @@ import com.google.common.base.Suppliers; import com.google.inject.Inject; import com.yahoo.container.di.componentgraph.Provider; import com.yahoo.language.Linguistics; -import com.yahoo.language.opennlp.LangDetectModel; import com.yahoo.language.opennlp.OpenNlpLinguistics; /** @@ -22,8 +21,8 @@ public class DefaultLinguisticsProvider implements Provider<Linguistics> { private final Supplier<Linguistics> linguisticsSupplier; @Inject - public DefaultLinguisticsProvider(LangDetectModel detectorModel) { - linguisticsSupplier = Suppliers.memoize(() -> new OpenNlpLinguistics(detectorModel)); + public DefaultLinguisticsProvider() { + linguisticsSupplier = Suppliers.memoize(OpenNlpLinguistics::new); } @Override diff --git a/linguistics-components/src/main/java/com/yahoo/language/opennlp/LangDetectModel183.java b/linguistics-components/src/main/java/com/yahoo/language/opennlp/LangDetectModel183.java deleted file mode 100644 index c9e78259336..00000000000 --- a/linguistics-components/src/main/java/com/yahoo/language/opennlp/LangDetectModel183.java +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import opennlp.tools.langdetect.LanguageDetectorModel; - -import java.io.IOException; -import java.io.UncheckedIOException; - -public class LangDetectModel183 implements LangDetectModel { - - private final Object monitor = new Object(); - private LanguageDetectorModel loaded; - - @Override - public LanguageDetectorModel load() { - synchronized (monitor) { - if (loaded == null) { - try { - loaded = new LanguageDetectorModel(LangDetectModel183.class.getResourceAsStream("/models/langdetect-183.bin")); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - return loaded; - } - -} diff --git a/linguistics-components/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics-components/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java deleted file mode 100644 index aa4387bcc45..00000000000 --- a/linguistics-components/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator; -import opennlp.tools.langdetect.LanguageDetectorContextGenerator; -import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; -import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; -import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; -import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; - -/** - * Overrides the UrlCharSequenceNormalizer, which has a bad regex, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350 - * - * @author jonmv - */ -@SuppressWarnings("unused") // Loaded by black magic. -public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDetectorFactory { - - @Override - public LanguageDetectorContextGenerator getContextGenerator() { - return new DefaultLanguageDetectorContextGenerator(1, 3, - EmojiCharSequenceNormalizer.getInstance(), - UrlCharSequenceNormalizer.getInstance(), - TwitterCharSequenceNormalizer.getInstance(), - NumberCharSequenceNormalizer.getInstance(), - ShrinkCharSequenceNormalizer.getInstance()); - } - -} diff --git a/linguistics-components/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java b/linguistics-components/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java deleted file mode 100644 index 883319e2f8b..00000000000 --- a/linguistics-components/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import opennlp.tools.util.normalizer.CharSequenceNormalizer; - -import java.util.regex.Pattern; - -/** - * Modifies {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer} to avoid the bad email regex. - * - * @author jonmv - */ -public class UrlCharSequenceNormalizer implements CharSequenceNormalizer { - - private static final Pattern URL_REGEX = - Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+"); - private static final Pattern MAIL_REGEX = - Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+"); - - private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer(); - - public static UrlCharSequenceNormalizer getInstance() { - return INSTANCE; - } - - public CharSequence normalize(CharSequence text) { - String modified = URL_REGEX.matcher(text).replaceAll(" "); - return MAIL_REGEX.matcher(modified).replaceAll(" "); - } - -} diff --git a/linguistics-components/src/main/java/com/yahoo/language/opennlp/package-info.java b/linguistics-components/src/main/java/com/yahoo/language/opennlp/package-info.java deleted file mode 100644 index 9606578b3ac..00000000000 --- a/linguistics-components/src/main/java/com/yahoo/language/opennlp/package-info.java +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -@ExportPackage -package com.yahoo.language.opennlp; - -import com.yahoo.osgi.annotation.ExportPackage; diff --git a/linguistics-components/src/main/resources/models/langdetect-183.bin b/linguistics-components/src/main/resources/models/langdetect-183.bin Binary files differdeleted file mode 100644 index c3cde217050..00000000000 --- a/linguistics-components/src/main/resources/models/langdetect-183.bin +++ /dev/null diff --git a/linguistics-components/src/test/java/com/yahoo/language/opennlp/OpenNlpDetectorTestCase.java b/linguistics-components/src/test/java/com/yahoo/language/opennlp/OpenNlpDetectorTestCase.java deleted file mode 100644 index 5b095ae0681..00000000000 --- a/linguistics-components/src/test/java/com/yahoo/language/opennlp/OpenNlpDetectorTestCase.java +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import com.yahoo.language.Language; -import com.yahoo.language.detect.Detector; -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -/** - * @author jonmv - */ -public class OpenNlpDetectorTestCase { - - @Test - public void testDetection() { - Detector detector = new OpenNlpDetector(new LangDetectModel183().load()); - - assertLanguage(Language.UNKNOWN, - "", - detector); - - assertLanguage(Language.UNKNOWN, - "Hello!", - detector); - - // from https://en.wikipedia.org/wiki/Yahoo - assertLanguage(Language.ENGLISH, - "Yahoo became a public company via an initial public offering in April 1996 and its stock price rose 600% within two years.", - detector); - - // from https://de.wikipedia.org/wiki/Yahoo - assertLanguage(Language.GERMAN, - "1996 ging Yahoo mit 46 Angestellten an die Börse. 2009 arbeiteten insgesamt rund 13.500 Mitarbeiter für Yahoo.", - detector); - - // from https://fr.wikipedia.org/wiki/Yahoo - assertLanguage(Language.FRENCH, - "À l'origine, Yahoo! était uniquement un annuaire Web.", - detector); - - // Test fallback to SimpleDetector - assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input - "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002", - detector); - - // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F - assertLanguage(Language.RUSSIAN, - "7 февраля 2000 года Yahoo.com подвергся DDoS атаке и на несколько часов приостановил работу.", - detector); - - // https://he.wikipedia.org/wiki/Yahoo! - assertLanguage(Language.HEBREW, - "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום", - detector); - } - - private void assertLanguage(Language language, String input, Detector detector) { - assertEquals(language, detector.detect(input, null).getLanguage()); - } - -} diff --git a/linguistics-components/src/test/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizerTest.java b/linguistics-components/src/test/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizerTest.java deleted file mode 100644 index a8c637bc6ec..00000000000 --- a/linguistics-components/src/test/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizerTest.java +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -/** - * @author jonmv - */ -public class UrlCharSequenceNormalizerTest { - - @Test - public void testNormalization() { - String text = "xxx+yyy_.dude@mail.com foo bar@baz_bax https://host.tld/path?query=boo a@b §boo@boo"; - assertEquals(" foo _bax a@b § ", - UrlCharSequenceNormalizer.getInstance().normalize(text)); - } - -} diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LangDetectModel.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LangDetectModel.java deleted file mode 100644 index 144b4612005..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/LangDetectModel.java +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import opennlp.tools.langdetect.LanguageDetectorModel; - -/** - * Wrapper to lazily load a langdetect model for OpenNLP. - * - * @author jonmv - */ -public interface LangDetectModel { - - /** Loads a {@link LanguageDetectorModel}, or throws if this fails. */ - LanguageDetectorModel load(); - -} diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java index e0c0960b920..849452aeafd 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java @@ -6,7 +6,6 @@ import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.language.simple.SimpleDetector; -import opennlp.tools.cmdline.langdetect.LanguageDetectorModelLoader; import opennlp.tools.langdetect.LanguageDetectorConfig; import opennlp.tools.langdetect.LanguageDetectorME; import opennlp.tools.langdetect.LanguageDetectorModel; @@ -15,7 +14,6 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.nio.ByteBuffer; import java.nio.charset.Charset; -import java.nio.file.Paths; import java.util.HashMap; import java.util.Locale; import java.util.Map; @@ -29,13 +27,16 @@ import static java.nio.charset.StandardCharsets.UTF_8; */ class OpenNlpDetector implements Detector { + private static final Object monitor = new Object(); + private static LanguageDetectorModel model; + private final SimpleDetector simple = new SimpleDetector(); private final Map<String, Language> languagesByISO3 = new HashMap<>(); private final LanguageDetectorME detector; private final LanguageDetectorConfig config; - OpenNlpDetector(LanguageDetectorModel model) { - detector = new LanguageDetectorME(model); + OpenNlpDetector() { + detector = new LanguageDetectorME(loadModel()); config = new LanguageDetectorConfig(); config.setMinDiff(0.02); config.setChunkSize(64); @@ -43,6 +44,20 @@ class OpenNlpDetector implements Detector { languagesByISO3.put(locale.getISO3Language(), Language.fromLocale(locale)); } + private static LanguageDetectorModel loadModel() { + synchronized (monitor) { + if (model == null) { + try { + model = new LanguageDetectorModel(OpenNlpDetector.class.getResourceAsStream("/models/langdetect-183.bin")); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + return model; + } + @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { Charset encoding = Charset.forName(simple.guessEncoding(input, offset, length)); diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 1f4ec4e261b..c749679024a 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -20,8 +20,8 @@ public class OpenNlpLinguistics extends SimpleLinguistics { private final Detector detector; @Inject - public OpenNlpLinguistics(LangDetectModel model) { - this.detector = new OpenNlpDetector(model.load()); + public OpenNlpLinguistics() { + this.detector = new OpenNlpDetector(); } @Override diff --git a/standalone-container/pom.xml b/standalone-container/pom.xml index 26eac432f67..a605bdb7d39 100644 --- a/standalone-container/pom.xml +++ b/standalone-container/pom.xml @@ -15,18 +15,6 @@ <dependencies> <dependency> <groupId>com.yahoo.vespa</groupId> - <artifactId>linguistics-components</artifactId> - <version>${project.version}</version> - <scope>compile</scope> - <exclusions> - <exclusion> - <groupId>com.yahoo.vespa</groupId> - <artifactId>linguistics</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>com.yahoo.vespa</groupId> <artifactId>config-provisioning</artifactId> <version>${project.version}</version> <scope>provided</scope> |