From 1abb5adacbdbcfad7070243630164e4d31f68773 Mon Sep 17 00:00:00 2001
From: Jon Bratseth
Date: Sat, 25 Sep 2021 14:50:33 +0000
Subject: Separate component from linguistics

---
 .../SentencePieceConfigurationTest.java            |  68 ++++++++++++++
 .../language/sentencepiece/SentencePieceTest.java  | 103 +++++++++++++++++++++
 .../sentencepiece/SentencePieceTester.java         |  66 +++++++++++++
 .../models/sentencepiece/en.wiki.bpe.vs10000.model | Bin 0 -> 400869 bytes
 .../models/sentencepiece/ja.wiki.bpe.vs5000.model  | Bin 0 -> 300865 bytes
 5 files changed, 237 insertions(+)
 create mode 100644 linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
 create mode 100644 linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
 create mode 100644 linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
 create mode 100644 linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model
 create mode 100644 linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model

(limited to 'linguistics-components/src/test')

diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
new file mode 100644
index 00000000000..edbbe21ec53
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
@@ -0,0 +1,68 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.config.FileReference;
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+/**
+ * Tests segmentation by SentencePieceEncoder instances built from a SentencePieceConfig.
+ *
+ * @author bratseth
+ */
+public class SentencePieceConfigurationTest {
+
+    // Relative paths: the tests must run with the linguistics-components module as working directory
+    private static final String ENGLISH_MODEL = "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model";
+    private static final String JAPANESE_MODEL = "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model";
+
+    @Test
+    public void testEnglishTokenization() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("unknown", ENGLISH_MODEL, b);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
+        tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo");
+    }
+
+    @Test
+    public void testNoCollapse() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("unknown", ENGLISH_MODEL, b);
+        b.collapseUnknowns(false);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        // With collapsing disabled K, H and J are expected as separate segments
+        tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
+    }
+
+    @Test
+    public void testHighestScore() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("unknown", ENGLISH_MODEL, b);
+        b.scoring(SentencePieceConfig.Scoring.highestScore);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented("hello", "▁h", "el", "lo");
+    }
+
+    @Test
+    public void testMultiLanguageTokenization() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("ja", JAPANESE_MODEL, b);
+        addModel("en", ENGLISH_MODEL, b);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
+        tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
+        // The same input is expected to segment differently under the Japanese model
+        tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o");
+    }
+
+    /** Adds a model entry with the given language code and model file path to the given config builder. */
+    private void addModel(String language, String file, SentencePieceConfig.Builder b) {
+        var mb = new SentencePieceConfig.Model.Builder();
+        mb.language(language);
+        mb.path(new FileReference(file));
+        b.model(mb);
+    }
+
+}
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
new file mode 100644
index 00000000000..d60d7386d4b
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
@@ -0,0 +1,103 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Tests segmentation and encoding by SentencePieceEncoder instances built directly from model files.
+ *
+ * @author bratseth
+ */
+public class SentencePieceTest {
+
+    // Relative paths: the tests must run with the linguistics-components module as working directory
+    private static final File ENGLISH_MODEL = new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model");
+    private static final File JAPANESE_MODEL = new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model");
+
+    @Test
+    public void testEnglishTokenization() {
+        var tester = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        tester.assertSegmented("h", "▁h");
+        tester.assertSegmented("he", "▁he");
+        tester.assertSegmented("hel", "▁hel");
+        tester.assertSegmented("hello", "▁hel", "lo");
+        tester.assertSegmented("hei", "▁he", "i");
+        tester.assertSegmented("hei you", "▁he", "i", "▁you");
+        // NOTE(review): identical to the assertion above; presumably the input whitespace was meant to differ - confirm
+        tester.assertSegmented("hei you", "▁he", "i", "▁you");
+        tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
+        tester.assertSegmented("hello world!", "▁hel", "lo", "▁world", "!");
+        tester.assertSegmented("Hello, world!", "▁", "H", "ello", ",", "▁world", "!");
+        tester.assertSegmented("HELLO, world!", "▁", "HELLO", ",", "▁world", "!");
+        tester.assertSegmented("KHJKJHHKJHHSH", "▁", "KHJKJHHKJHHSH");
+        tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo");
+        // Leading and trailing whitespace is expected to add no segments
+        tester.assertSegmented(" hello ", "▁hel", "lo");
+        tester.assertSegmented(")(/&#()/\"\")", "▁)", "(", "/", "&", "#", "(", ")", "/", "\"", "\")");
+        tester.assertSegmented(")(/&#(small)/\"in quotes\")", "▁)", "(", "/", "&", "#", "(", "sm", "all", ")", "/", "\"", "in", "▁qu", "otes", "\")");
+        tester.assertSegmented("x.400AS", "▁x", ".", "4", "00", "AS");
+        tester.assertSegmented("A normal sentence. Yes one more.", "▁", "A", "▁normal", "▁sentence", ".", "▁", "Y", "es", "▁one", "▁more", ".");
+    }
+
+    @Test
+    public void testIntegerListEncoding() {
+        var tester = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        tester.assertEncoded("hello, world!", 908, 1418, 9934, 501, 9960);
+        tester.assertEncoded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960);
+    }
+
+    @Test
+    public void testDenseTensorEncoding() {
+        var tester = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        // Codes fill the tensor from the start; the remaining cells are expected to be 0
+        tester.assertEncoded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]");
+        tester.assertEncoded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]");
+        // Codes beyond the tensor size are expected to be left out
+        tester.assertEncoded("hello, world!", "tensor(d[2])", "[908,1418]");
+    }
+
+    @Test
+    public void testSparseTensorEncoding() {
+        var tester = new SentencePieceTester(ENGLISH_MODEL.toPath());
+        // Cells are keyed by segment; the expected values are the segments' positions in "▁hel", "lo"
+        tester.assertEncoded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}");
+    }
+
+    @Test
+    public void testNoCollapse() {
+        var tester = new SentencePieceTester(new SentencePieceEncoder.Builder()
+                .addDefaultModel(ENGLISH_MODEL.toPath())
+                .setCollapseUnknowns(false));
+        // With collapsing disabled K, H and J are expected as separate segments
+        tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
+    }
+
+    @Test
+    public void testHighestScore() {
+        var tester = new SentencePieceTester(new SentencePieceEncoder.Builder()
+                .addDefaultModel(ENGLISH_MODEL.toPath())
+                .setScoring(Scoring.highestScore));
+        tester.assertSegmented("h", "▁h");
+        tester.assertSegmented("he", "▁he");
+        // Without highestScore scoring these inputs start with the single segment "▁hel" (see testEnglishTokenization)
+        tester.assertSegmented("hel", "▁h", "el");
+        tester.assertSegmented("hello", "▁h", "el", "lo");
+    }
+
+    @Test
+    public void testMultiLanguageTokenization() {
+        SentencePieceEncoder.Builder builder = new SentencePieceEncoder.Builder();
+        builder.addModel(Language.JAPANESE, JAPANESE_MODEL.toPath());
+        builder.addModel(Language.ENGLISH, ENGLISH_MODEL.toPath());
+        var tester = new SentencePieceTester(builder);
+        tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
+        tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
+        // The same input is expected to segment differently under the Japanese model
+        tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o");
+    }
+
+}
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
new file mode 100644
index 00000000000..1ba7c9b472d
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
@@ -0,0 +1,66 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+//
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.language.Language;
+import com.yahoo.tensor.Tensor;
+import com.yahoo.tensor.TensorType;
+
+import java.nio.file.Path;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test helper wrapping a SentencePieceEncoder with assertions on its segmentation and encoding output.
+ */
+class SentencePieceTester {
+
+    private final SentencePieceEncoder encoder;
+
+    /** Creates a tester for an encoder built with the given model file added as its default model. */
+    public SentencePieceTester(Path model) {
+        this(new SentencePieceEncoder.Builder().addDefaultModel(model));
+    }
+
+    /** Creates a tester for the encoder built by the given builder. */
+    public SentencePieceTester(SentencePieceEncoder.Builder builder) {
+        this(builder.build());
+    }
+
+    /** Creates a tester for the given encoder. */
+    public SentencePieceTester(SentencePieceEncoder encoder) {
+        this.encoder = encoder;
+    }
+
+    /** Asserts that encoding the input with Language.UNKNOWN yields exactly the expected codes, in order. */
+    public void assertEncoded(String input, Integer... expectedCodes) {
+        assertArrayEquals(expectedCodes, encoder.encode(input, Language.UNKNOWN).toArray());
+    }
+
+    /**
+     * Asserts that encoding the input with Language.UNKNOWN into a tensor of the given type
+     * yields a tensor equal to the expected one.
+     *
+     * @param input the text to encode
+     * @param tensorType the tensor type spec to encode into, parsed by TensorType.fromSpec
+     * @param tensor the expected tensor in string form, parsed by Tensor.from using that type
+     */
+    public void assertEncoded(String input, String tensorType, String tensor) {
+        TensorType type = TensorType.fromSpec(tensorType);
+        Tensor expected = Tensor.from(type, tensor);
+        assertEquals(expected, encoder.encode(input, Language.UNKNOWN, type));
+    }
+
+    /** Asserts that segmenting the input with Language.UNKNOWN yields exactly the expected segments, in order. */
+    public void assertSegmented(String input, String... expectedSegments) {
+        assertSegmented(Language.UNKNOWN, input, expectedSegments);
+    }
+
+    /** Asserts that segmenting the input in the given language yields exactly the expected segments, in order. */
+    public void assertSegmented(Language language, String input, String... expectedSegments) {
+        assertArrayEquals(expectedSegments, encoder.segment(input, language).toArray());
+    }
+
+}
diff --git a/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model b/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model
new file mode 100644
index 00000000000..89f93ef3517
Binary files /dev/null and b/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model differ
diff --git a/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model b/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model
new file mode 100644
index 00000000000..41c0688d9df
Binary files /dev/null and b/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model differ
-- 
cgit v1.2.3