diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-09-25 14:50:33 +0000 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-09-25 14:50:33 +0000 |
commit | 1abb5adacbdbcfad7070243630164e4d31f68773 (patch) | |
tree | 069d42abf56db3a14bcff1f01df6291728daa32c /linguistics/src/test/java/com/yahoo/language | |
parent | 5ff3a5a4cbbd43180d821700f572eae73720fc17 (diff) |
Separate component from linguistics
Diffstat (limited to 'linguistics/src/test/java/com/yahoo/language')
3 files changed, 0 insertions, 197 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java deleted file mode 100644 index edbbe21ec53..00000000000 --- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -package com.yahoo.language.sentencepiece; - -import com.yahoo.config.FileReference; -import com.yahoo.language.Language; -import org.junit.Test; - -/** - * @author bratseth - */ -public class SentencePieceConfigurationTest { - - @Test - public void testEnglishTokenization() { - var b = new SentencePieceConfig.Builder(); - addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); - tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence"); - tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo"); - } - - @Test - public void testNoCollapse() { - var b = new SentencePieceConfig.Builder(); - addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - b.collapseUnknowns(false); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); - tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); - } - - @Test - public void testHighestScore() { - var b = new SentencePieceConfig.Builder(); - addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - b.scoring(SentencePieceConfig.Scoring.highestScore); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); - tester.assertSegmented("hello", "▁h", "el", "lo"); - } - - @Test - public void testMultiLanguageTokenization() { - var b = new SentencePieceConfig.Builder(); - addModel("ja", "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model", b); - addModel("en", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); - tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト"); - tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo"); - tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o"); - } - - private void addModel(String language, String file, SentencePieceConfig.Builder b) { - var mb = new SentencePieceConfig.Model.Builder(); - mb.language(language); - mb.path(new FileReference(file)); - b.model(mb); - } - -} diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java deleted file mode 100644 index d60d7386d4b..00000000000 --- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -package com.yahoo.language.sentencepiece; - -import com.yahoo.language.Language; -import org.junit.Test; - -import java.io.File; - -/** - * @author bratseth - */ -public class SentencePieceTest { - - @Test - public void testEnglishTokenization() { - var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertSegmented("h", "▁h"); - tester.assertSegmented("he", "▁he"); - tester.assertSegmented("hel", "▁hel"); - tester.assertSegmented("hello", "▁hel", "lo"); - tester.assertSegmented("hei", "▁he", "i"); - tester.assertSegmented("hei you", "▁he", "i", "▁you"); - tester.assertSegmented("hei you", "▁he", "i", "▁you"); - tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence"); - tester.assertSegmented("hello world!", "▁hel", "lo", "▁world", "!"); - tester.assertSegmented("Hello, world!", "▁", "H", "ello", ",", "▁world", "!"); - tester.assertSegmented("HELLO, world!", "▁", "HELLO", ",", "▁world", "!"); - tester.assertSegmented("KHJKJHHKJHHSH", "▁", "KHJKJHHKJHHSH"); - tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo"); - tester.assertSegmented(" hello ", "▁hel", "lo"); - tester.assertSegmented(")(/&#()/\"\")", "▁)", "(", "/", "&", "#", "(", ")", "/", "\"", "\")"); - tester.assertSegmented(")(/&#(small)/\"in quotes\")", "▁)", "(", "/", "&", "#", "(", "sm", "all", ")", "/", "\"", "in", "▁qu", "otes", "\")"); - tester.assertSegmented("x.400AS", "▁x", ".", "4", "00", "AS"); - tester.assertSegmented("A normal sentence. Yes one more.", "▁", "A", "▁normal", "▁sentence", ".", "▁", "Y", "es", "▁one", "▁more", "."); - } - - @Test - public void testIntegerListEncoding() { - var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", 908, 1418, 9934, 501, 9960); - tester.assertEncoded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); - } - - @Test - public void testDenseTensorEncoding() { - var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); - tester.assertEncoded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); - tester.assertEncoded("hello, world!", "tensor(d[2])", "[908,1418]"); - } - - @Test - public void testSparseTensorEncoding() { - var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); - } - - @Test - public void testNoCollapse() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() - .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) - .setCollapseUnknowns(false)); - tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); - } - - @Test - public void testHighestScore() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() - .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) - .setScoring(Scoring.highestScore)); - tester.assertSegmented("h", "▁h"); - tester.assertSegmented("he", "▁he"); - tester.assertSegmented("hel", "▁h", "el"); - tester.assertSegmented("hello", "▁h", "el", "lo"); - } - - @Test - public void testMultiLanguageTokenization() { - SentencePieceEncoder.Builder builder = new SentencePieceEncoder.Builder(); - builder.addModel(Language.JAPANESE, new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model").toPath()); - builder.addModel(Language.ENGLISH, new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - var tester = new SentencePieceTester(builder); - tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト"); - tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo"); - tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o"); - } - -} diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java deleted file mode 100644 index 1ba7c9b472d..00000000000 --- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -// - -package com.yahoo.language.sentencepiece; - -import com.yahoo.language.Language; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -import java.nio.file.Path; - -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; - -class SentencePieceTester { - - private final SentencePieceEncoder encoder; - - public SentencePieceTester(Path model) { - this(new SentencePieceEncoder.Builder().addDefaultModel(model)); - } - - public SentencePieceTester(SentencePieceEncoder.Builder builder) { - this(builder.build()); - } - - public SentencePieceTester(SentencePieceEncoder encoder) { - this.encoder = encoder; - } - - public void assertEncoded(String input, Integer... expectedCodes) { - assertArrayEquals(expectedCodes, encoder.encode(input, Language.UNKNOWN).toArray()); - } - - public void assertEncoded(String input, String tensorType, String tensor) { - TensorType type = TensorType.fromSpec(tensorType); - Tensor expected = Tensor.from(type, tensor); - assertEquals(expected, encoder.encode(input, Language.UNKNOWN, type)); - } - - public void assertSegmented(String input, String... expectedSegments) { - assertSegmented(Language.UNKNOWN, input, expectedSegments); - } - - public void assertSegmented(Language language, String input, String... expectedSegments) { - assertArrayEquals(expectedSegments, encoder.segment(input, language).toArray()); - } - -} |