diff options
Diffstat (limited to 'linguistics-components/src/test/java')
3 files changed, 23 insertions, 23 deletions
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java index edbbe21ec53..1ed2271f774 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java @@ -15,7 +15,7 @@ public class SentencePieceConfigurationTest { public void testEnglishTokenization() { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence"); tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo"); } @@ -25,7 +25,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); b.collapseUnknowns(false); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); } @@ -34,7 +34,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); b.scoring(SentencePieceConfig.Scoring.highestScore); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("hello", "▁h", "el", "lo"); } @@ -43,7 +43,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("ja", "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model", b); addModel("en", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト"); tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo"); tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o"); diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java index d60d7386d4b..939f8ebe9d3 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java @@ -38,27 +38,27 @@ public class SentencePieceTest { @Test public void testIntegerListEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", 908, 1418, 9934, 501, 9960); - tester.assertEncoded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); + tester.assertEmbedded("hello, world!", 908, 1418, 9934, 501, 9960); + tester.assertEmbedded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); } @Test public void testDenseTensorEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); - tester.assertEncoded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); - tester.assertEncoded("hello, world!", "tensor(d[2])", "[908,1418]"); + tester.assertEmbedded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); + tester.assertEmbedded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); + tester.assertEmbedded("hello, world!", "tensor(d[2])", "[908,1418]"); } @Test public void testSparseTensorEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); + tester.assertEmbedded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); } @Test public void testNoCollapse() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() + var tester = new SentencePieceTester(new SentencePieceEmbedder.Builder() .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) .setCollapseUnknowns(false)); tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); @@ -66,7 +66,7 @@ public class SentencePieceTest { @Test public void testHighestScore() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() + var tester = new SentencePieceTester(new SentencePieceEmbedder.Builder() .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) .setScoring(Scoring.highestScore)); tester.assertSegmented("h", "▁h"); @@ -77,7 +77,7 @@ public class SentencePieceTest { @Test public void testMultiLanguageTokenization() { - SentencePieceEncoder.Builder builder = new SentencePieceEncoder.Builder(); + SentencePieceEmbedder.Builder builder = new SentencePieceEmbedder.Builder(); builder.addModel(Language.JAPANESE, new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model").toPath()); builder.addModel(Language.ENGLISH, new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); var tester = new SentencePieceTester(builder); diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java index 1ba7c9b472d..c4cb13a3d23 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java @@ -14,28 +14,28 @@ import static org.junit.Assert.assertEquals; class SentencePieceTester { - private final SentencePieceEncoder encoder; + private final SentencePieceEmbedder embedder; public SentencePieceTester(Path model) { - this(new SentencePieceEncoder.Builder().addDefaultModel(model)); + this(new SentencePieceEmbedder.Builder().addDefaultModel(model)); } - public SentencePieceTester(SentencePieceEncoder.Builder builder) { + public SentencePieceTester(SentencePieceEmbedder.Builder builder) { this(builder.build()); } - public SentencePieceTester(SentencePieceEncoder encoder) { - this.encoder = encoder; + public SentencePieceTester(SentencePieceEmbedder embedder) { + this.embedder = embedder; } - public void assertEncoded(String input, Integer... expectedCodes) { - assertArrayEquals(expectedCodes, encoder.encode(input, Language.UNKNOWN).toArray()); + public void assertEmbedded(String input, Integer... expectedCodes) { + assertArrayEquals(expectedCodes, embedder.embed(input, Language.UNKNOWN).toArray()); } - public void assertEncoded(String input, String tensorType, String tensor) { + public void assertEmbedded(String input, String tensorType, String tensor) { TensorType type = TensorType.fromSpec(tensorType); Tensor expected = Tensor.from(type, tensor); - assertEquals(expected, encoder.encode(input, Language.UNKNOWN, type)); + assertEquals(expected, embedder.embed(input, Language.UNKNOWN, type)); } public void assertSegmented(String input, String... expectedSegments) { @@ -43,7 +43,7 @@ class SentencePieceTester { } public void assertSegmented(Language language, String input, String... expectedSegments) { - assertArrayEquals(expectedSegments, encoder.segment(input, language).toArray()); + assertArrayEquals(expectedSegments, embedder.segment(input, language).toArray()); } } |