From 1abb5adacbdbcfad7070243630164e4d31f68773 Mon Sep 17 00:00:00 2001
From: Jon Bratseth <bratseth@gmail.com>
Date: Sat, 25 Sep 2021 14:50:33 +0000
Subject: Separate component from linguistics

---
 .../SentencePieceConfigurationTest.java            |  59 ++++++++++++++
 .../language/sentencepiece/SentencePieceTest.java  |  89 +++++++++++++++++++++
 .../sentencepiece/SentencePieceTester.java         |  49 ++++++++++++
 .../models/sentencepiece/en.wiki.bpe.vs10000.model | Bin 0 -> 400869 bytes
 .../models/sentencepiece/ja.wiki.bpe.vs5000.model  | Bin 0 -> 300865 bytes
 5 files changed, 197 insertions(+)
 create mode 100644 linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
 create mode 100644 linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
 create mode 100644 linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
 create mode 100644 linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model
 create mode 100644 linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model

(limited to 'linguistics-components/src/test')

diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
new file mode 100644
index 00000000000..edbbe21ec53
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.config.FileReference;
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+/**
+ * Tests SentencePieceEncoder instances which are set up from a SentencePieceConfig.
+ * @author bratseth
+ */
+public class SentencePieceConfigurationTest {
+
+    private static final String ENGLISH_MODEL = "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model";
+    private static final String JAPANESE_MODEL = "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model";
+
+    @Test
+    public void testEnglishTokenization() {
+        var config = withModel(new SentencePieceConfig.Builder(), "unknown", ENGLISH_MODEL);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(config.build()));
+        tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
+        tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo");
+    }
+
+    @Test
+    public void testNoCollapse() {
+        var config = withModel(new SentencePieceConfig.Builder(), "unknown", ENGLISH_MODEL);
+        config.collapseUnknowns(false);
+        new SentencePieceTester(new SentencePieceEncoder(config.build())).assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
+    }
+
+    @Test
+    public void testHighestScore() {
+        var config = withModel(new SentencePieceConfig.Builder(), "unknown", ENGLISH_MODEL);
+        config.scoring(SentencePieceConfig.Scoring.highestScore);
+        new SentencePieceTester(new SentencePieceEncoder(config.build())).assertSegmented("hello", "▁h", "el", "lo");
+    }
+
+    @Test
+    public void testMultiLanguageTokenization() {
+        var config = withModel(new SentencePieceConfig.Builder(), "ja", JAPANESE_MODEL);
+        withModel(config, "en", ENGLISH_MODEL);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(config.build()));
+        tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
+        tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
+        tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o");
+    }
+
+    /** Adds a model with the given language and file reference to config, and returns that same config. */
+    private static SentencePieceConfig.Builder withModel(SentencePieceConfig.Builder config, String language, String file) {
+        var model = new SentencePieceConfig.Model.Builder();
+        model.language(language);
+        model.path(new FileReference(file));
+        config.model(model);
+        return config;
+    }
+
+}
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
new file mode 100644
index 00000000000..d60d7386d4b
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
@@ -0,0 +1,89 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Tests segmentation and encoding by SentencePieceEncoder instances set up through the encoder builder.
+ * @author bratseth
+ */
+public class SentencePieceTest {
+
+    private static final File ENGLISH_MODEL = new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model");
+    private static final File JAPANESE_MODEL = new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model");
+
+    @Test
+    public void testEnglishTokenization() {
+        var english = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        english.assertSegmented("h", "▁h");
+        english.assertSegmented("he", "▁he");
+        english.assertSegmented("hel", "▁hel");
+        english.assertSegmented("hello", "▁hel", "lo");
+        english.assertSegmented("hei", "▁he", "i");
+        english.assertSegmented("hei you", "▁he", "i", "▁you");
+        english.assertSegmented("hei  you", "▁he", "i", "▁you");
+        english.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
+        english.assertSegmented("hello world!", "▁hel", "lo", "▁world", "!");
+        english.assertSegmented("Hello, world!", "▁", "H", "ello", ",", "▁world", "!");
+        english.assertSegmented("HELLO, world!", "▁", "HELLO", ",", "▁world", "!");
+        english.assertSegmented("KHJKJHHKJHHSH", "▁", "KHJKJHHKJHHSH");
+        english.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo");
+        english.assertSegmented("  hello  ", "▁hel", "lo");
+        english.assertSegmented(")(/&#()/\"\")", "▁)", "(", "/", "&", "#", "(", ")", "/", "\"", "\")");
+        english.assertSegmented(")(/&#(small)/\"in quotes\")", "▁)", "(", "/", "&", "#", "(", "sm", "all", ")", "/", "\"", "in", "▁qu", "otes", "\")");
+        english.assertSegmented("x.400AS", "▁x", ".", "4", "00", "AS");
+        english.assertSegmented("A normal sentence. Yes one more.", "▁", "A", "▁normal", "▁sentence", ".", "▁", "Y", "es", "▁one", "▁more", ".");
+    }
+
+    @Test
+    public void testIntegerListEncoding() {
+        var english = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        english.assertEncoded("hello, world!", 908, 1418, 9934, 501, 9960);
+        english.assertEncoded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960);
+    }
+
+    @Test
+    public void testDenseTensorEncoding() {
+        var english = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        english.assertEncoded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]");
+        english.assertEncoded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]");
+        english.assertEncoded("hello, world!", "tensor(d[2])", "[908,1418]");
+    }
+
+    @Test
+    public void testSparseTensorEncoding() {
+        var english = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        english.assertEncoded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}");
+    }
+
+    @Test
+    public void testNoCollapse() {
+        var uncollapsed = new SentencePieceTester(new SentencePieceEncoder.Builder().addDefaultModel(ENGLISH_MODEL.toPath()).setCollapseUnknowns(false));
+        uncollapsed.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
+    }
+
+    @Test
+    public void testHighestScore() {
+        var highestScoring = new SentencePieceTester(new SentencePieceEncoder.Builder().addDefaultModel(ENGLISH_MODEL.toPath()).setScoring(Scoring.highestScore));
+        highestScoring.assertSegmented("h", "▁h");
+        highestScoring.assertSegmented("he", "▁he");
+        highestScoring.assertSegmented("hel", "▁h", "el");
+        highestScoring.assertSegmented("hello", "▁h", "el", "lo");
+    }
+
+    @Test
+    public void testMultiLanguageTokenization() {
+        var models = new SentencePieceEncoder.Builder();
+        models.addModel(Language.JAPANESE, JAPANESE_MODEL.toPath());
+        models.addModel(Language.ENGLISH, ENGLISH_MODEL.toPath());
+        var multilingual = new SentencePieceTester(models);
+        multilingual.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
+        multilingual.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
+        multilingual.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o");
+    }
+
+}
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
new file mode 100644
index 00000000000..1ba7c9b472d
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
@@ -0,0 +1,49 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+//
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.language.Language;
+import com.yahoo.tensor.Tensor;
+import com.yahoo.tensor.TensorType;
+
+import java.nio.file.Path;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+/** Test helper which asserts on the segments and codes a SentencePieceEncoder produces for given inputs. */
+class SentencePieceTester {
+
+    private final SentencePieceEncoder encoder;
+
+    public SentencePieceTester(SentencePieceEncoder encoder) {
+        this.encoder = encoder;
+    }
+
+    public SentencePieceTester(SentencePieceEncoder.Builder builder) {
+        this(builder.build());
+    }
+
+    public SentencePieceTester(Path model) {
+        this(new SentencePieceEncoder.Builder().addDefaultModel(model));
+    }
+
+    public void assertSegmented(String input, String... expectedSegments) {
+        assertSegmented(Language.UNKNOWN, input, expectedSegments);
+    }
+
+    public void assertSegmented(Language language, String input, String... expectedSegments) {
+        assertArrayEquals(expectedSegments, encoder.segment(input, language).toArray());
+    }
+
+    public void assertEncoded(String input, Integer... expectedCodes) {
+        assertArrayEquals(expectedCodes, encoder.encode(input, Language.UNKNOWN).toArray());
+    }
+
+    public void assertEncoded(String input, String tensorType, String tensor) {
+        TensorType expectedType = TensorType.fromSpec(tensorType);
+        assertEquals(Tensor.from(expectedType, tensor), encoder.encode(input, Language.UNKNOWN, expectedType));
+    }
+
+}
diff --git a/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model b/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model
new file mode 100644
index 00000000000..89f93ef3517
Binary files /dev/null and b/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model differ
diff --git a/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model b/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model
new file mode 100644
index 00000000000..41c0688d9df
Binary files /dev/null and b/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model differ
-- 
cgit v1.2.3