summary refs log tree commit diff stats
path: root/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java')
-rw-r--r-- linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java 59
1 files changed, 59 insertions, 0 deletions
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
new file mode 100644
index 00000000000..edbbe21ec53
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.config.FileReference;
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+/** Builds SentencePieceEncoder instances from differently configured SentencePieceConfig builders and asserts the segments they produce.
+ * @author bratseth
+ */
+public class SentencePieceConfigurationTest {
+
+ @Test // No builder option set beyond a single model registered under the language string "unknown".
+ public void testEnglishTokenization() {
+ var b = new SentencePieceConfig.Builder();
+ addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+ var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+ tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
+ tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo"); // The upper-case run is expected back as one segment; contrast with testNoCollapse.
+ }
+
+ @Test // Same model as above, but with collapseUnknowns(false) set on the builder.
+ public void testNoCollapse() {
+ var b = new SentencePieceConfig.Builder();
+ addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+ b.collapseUnknowns(false);
+ var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+ tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); // Here K, H and J are expected as separate segments.
+ }
+
+ @Test // Same model as above, but with Scoring.highestScore selected on the builder.
+ public void testHighestScore() {
+ var b = new SentencePieceConfig.Builder();
+ addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+ b.scoring(SentencePieceConfig.Scoring.highestScore);
+ var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+ tester.assertSegmented("hello", "▁h", "el", "lo"); // Differs from the "▁hel", "lo" split the other tests in this class expect for the word "hello".
+ }
+
+ @Test // Two models, registered as "ja" and "en"; each assertion passes an explicit Language.
+ public void testMultiLanguageTokenization() {
+ var b = new SentencePieceConfig.Builder();
+ addModel("ja", "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model", b);
+ addModel("en", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+ var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+ tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
+ tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
+ tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o"); // Same input as the previous line but tagged JAPANESE, and a different split is expected; presumably the "ja" model is applied - confirm in SentencePieceEncoder.
+ }
+
+ private void addModel(String language, String file, SentencePieceConfig.Builder b) { // Adds one model entry (language string + file path) to b.
+ var mb = new SentencePieceConfig.Model.Builder();
+ mb.language(language);
+ mb.path(new FileReference(file)); // The path string is wrapped in a FileReference unchanged.
+ b.model(mb);
+ }
+
+}