aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics-components/src/test/java/com/yahoo/language
diff options
context:
space:
mode:
author Bjørn Christian Seime <bjorncs@yahooinc.com> 2023-06-08 14:23:16 +0200
committer Bjørn Christian Seime <bjorncs@yahooinc.com> 2023-06-08 14:23:16 +0200
commit c0652d7794a90e0afb593fc1a3db17c99606a808 (patch)
tree 17887acf2818107bbeb7355f5ee463f5fb02873d /linguistics-components/src/test/java/com/yahoo/language
parent c3d8c532e0f5b1db896d8693409098e8c2980da1 (diff)
Disable padding and make it configurable
Diffstat (limited to 'linguistics-components/src/test/java/com/yahoo/language')
-rw-r--r-- linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java 32
1 files changed, 23 insertions, 9 deletions
diff --git a/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java b/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java
index 6197fe214f1..8b34e1487be 100644
--- a/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java
+++ b/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java
@@ -27,7 +27,10 @@ class HuggingFaceTokenizerTest {
@Test
void bert_tokenizer() throws IOException {
- try (var tokenizer = createTokenizer(tmp, "bert-base-uncased")) {
+ try (var tokenizer = new HuggingFaceTokenizer.Builder()
+ .addSpecialTokens(false)
+ .addDefaultModel(decompressModelFile(tmp, "bert-base-uncased"))
+ .build()) {
var tester = new EmbedderTester(tokenizer);
tester.assertSegmented("what was the impact of the manhattan project",
"what", "was", "the", "impact", "of", "the", "manhattan", "project");
@@ -41,7 +44,10 @@ class HuggingFaceTokenizerTest {
@Test
void tokenizes_using_paraphrase_multilingual_mpnet_base_v2() throws IOException {
- try (var tokenizer = createTokenizer(tmp, "paraphrase-multilingual-mpnet-base-v2")) {
+ try (var tokenizer = new HuggingFaceTokenizer.Builder()
+ .addSpecialTokens(false)
+ .addDefaultModel(decompressModelFile(tmp, "paraphrase-multilingual-mpnet-base-v2"))
+ .build()) {
var tester = new EmbedderTester(tokenizer);
tester.assertSegmented("h", "▁h");
tester.assertSegmented("he", "▁he");
@@ -87,6 +93,21 @@ class HuggingFaceTokenizerTest {
}
}
+ @Test
+ void disables_padding_by_default() throws IOException {
+ var builder = new HuggingFaceTokenizer.Builder()
+ .addDefaultModel(decompressModelFile(tmp, "bert-base-uncased"))
+ .addSpecialTokens(true).setMaxLength(16);
+ String input = "what was the impact of the manhattan project";
+ try (var tokenizerWithDefaultPadding = builder.build();
+ var tokenizerWithPaddingDisabled = builder.setPadding(false).build();
+ var tokenizerWithPaddingEnabled = builder.setPadding(true).build()) {
+ assertMaxLengthRespected(10, tokenizerWithDefaultPadding.encode(input));
+ assertMaxLengthRespected(10, tokenizerWithPaddingDisabled.encode(input));
+ assertMaxLengthRespected(16, tokenizerWithPaddingEnabled.encode(input));
+ }
+ }
+
private static void assertMaxLengthRespected(int maxLength, Encoding encoding) {
assertEquals(maxLength, encoding.ids().size());
assertEquals(maxLength, encoding.tokens().size());
@@ -94,13 +115,6 @@ class HuggingFaceTokenizerTest {
assertEquals(maxLength, encoding.typeIds().size());
}
- private static HuggingFaceTokenizer createTokenizer(Path tmp, String model) throws IOException {
- return new HuggingFaceTokenizer.Builder()
- .addSpecialTokens(false)
- .addDefaultModel(decompressModelFile(tmp, model))
- .build();
- }
-
private static Path decompressModelFile(Path tmp, String model) throws IOException {
var source = Paths.get("src/test/models/huggingface/%s.json.gz".formatted(model));
Path destination = tmp.resolve(source.getFileName().toString().replace(".gz", ""));