diff options
author | Jon Bratseth <bratseth@oath.com> | 2021-09-27 23:09:03 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-27 23:09:03 +0200 |
commit | 2df97d23d9f25ae60f010a2e9f273cb5b38e049b (patch) | |
tree | d2923a45682e91d80e7011c60cfb301e05acead3 /linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java | |
parent | 037f756caf4cfb99bcd988174839d7bc385267b9 (diff) | |
parent | 8f3fb1a105ded07144f6de527266a438e48a1766 (diff) |
Merge pull request #19294 from vespa-engine/bratseth/linguistics-components (v7.473.17)
Bratseth/linguistics components
Diffstat (limited to 'linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java')
-rw-r--r-- | linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java | 59 |
1 files changed, 59 insertions, 0 deletions
```diff
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
new file mode 100644
index 00000000000..edbbe21ec53
--- /dev/null
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.language.sentencepiece;
+
+import com.yahoo.config.FileReference;
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+/**
+ * @author bratseth
+ */
+public class SentencePieceConfigurationTest {
+
+    @Test
+    public void testEnglishTokenization() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
+        tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo");
+    }
+
+    @Test
+    public void testNoCollapse() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+        b.collapseUnknowns(false);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
+    }
+
+    @Test
+    public void testHighestScore() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+        b.scoring(SentencePieceConfig.Scoring.highestScore);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented("hello", "▁h", "el", "lo");
+    }
+
+    @Test
+    public void testMultiLanguageTokenization() {
+        var b = new SentencePieceConfig.Builder();
+        addModel("ja", "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model", b);
+        addModel("en", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b);
+        var tester = new SentencePieceTester(new SentencePieceEncoder(b.build()));
+        tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
+        tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
+        tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o");
+    }
+
+    private void addModel(String language, String file, SentencePieceConfig.Builder b) {
+        var mb = new SentencePieceConfig.Model.Builder();
+        mb.language(language);
+        mb.path(new FileReference(file));
+        b.model(mb);
+    }
+
+}
```