diff options
author | Jo Kristian Bergum <bergum@yahoo-inc.com> | 2021-09-14 21:28:52 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-14 21:28:52 +0200 |
commit | 8fcb35fcde19bad6b4e8527404a185f1e95b4f6d (patch) | |
tree | 6e45108c76d338a26a857754b8d80b2eaf933235 | |
parent | 4a1ab13c48130dc4917e6075b5734f57d15dbcda (diff) | |
parent | 61ee6220583b9809fa62a79661e3558a9b782d31 (diff) |
Merge pull request #19129 from vespa-engine/bratseth/sp-tests
More unit tests
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java | 21 |
1 file changed, 20 insertions, 1 deletion
diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
index 70361f55750..7d0c1c5c78e 100644
--- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
+++ b/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java
@@ -17,7 +17,7 @@ import static org.junit.Assert.assertArrayEquals;
 public class SentencePieceTest {
 
     @Test
-    public void testEnglishTokenization() throws IOException {
+    public void testEnglishTokenization() {
         var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath());
         tester.assertSegmented("h", "▁h");
         tester.assertSegmented("he", "▁he");
@@ -42,6 +42,25 @@ public class SentencePieceTest {
     }
 
     @Test
+    public void testNoCollapse() {
+        var tester = new SentencePieceTester(new SentencePieceEncoder.Builder()
+                .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath())
+                .setCollapseUnknowns(false));
+        tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
+    }
+
+    @Test
+    public void testHighestScore() {
+        var tester = new SentencePieceTester(new SentencePieceEncoder.Builder()
+                .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath())
+                .setScoring(SentencePieceEncoder.Scoring.highestScore));
+        tester.assertSegmented("h", "▁h");
+        tester.assertSegmented("he", "▁he");
+        tester.assertSegmented("hel", "▁h", "el");
+        tester.assertSegmented("hello", "▁h", "el", "lo");
+    }
+
+    @Test
     public void testJapaneseTokenization() throws IOException {
         SentencePieceEncoder.Builder builder = new SentencePieceEncoder.Builder();
         builder.addModel(Language.JAPANESE, new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model").toPath());