summary refs log tree commit diff stats
path: root/linguistics-components
diff options
context:
space:
mode:
author Bjørn Christian Seime <bjorncs@yahooinc.com> 2023-06-08 15:13:06 +0200
committer Bjørn Christian Seime <bjorncs@yahooinc.com> 2023-06-08 15:16:38 +0200
commit 7ae3e26016c32175aaf437ac09fca48457eb338e (patch)
tree 8b872c0f31c38ee490ee33f120df4a7380e9ec0a /linguistics-components
parent 53e8203706e41e07875f37d0343b9e97a33b12f6 (diff)
Test padding with truncation
Diffstat (limited to 'linguistics-components')
-rw-r--r-- linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java 2
-rw-r--r-- linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java 5
2 files changed, 4 insertions, 3 deletions
diff --git a/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java b/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java
index a0b41669b2b..1f1757e6ade 100644
--- a/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java
+++ b/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java
@@ -46,7 +46,7 @@ public class HuggingFaceTokenizer extends AbstractComponent implements Embedder,
.optAddSpecialTokens(b.addSpecialTokens != null ? b.addSpecialTokens : true)
.optTruncation(b.truncation != null ? b.truncation : true)
.optMaxLength(b.maxLength != null ? b.maxLength : 512);
- if (b.padding != null && b.padding) hfb.optPadToMaxLength();
+ if (b.padding != null && b.padding) hfb.optPadToMaxLength(); else hfb.optPadding(false);
return hfb.build();
}));
});
diff --git a/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java b/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java
index 67f94800c39..bf2e0f82829 100644
--- a/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java
+++ b/linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java
@@ -101,15 +101,16 @@ class HuggingFaceTokenizerTest {
@Test
void disables_padding_by_default() throws IOException {
var builder = new HuggingFaceTokenizer.Builder()
+ .setTruncation(true)
.addDefaultModel(decompressModelFile(tmp, "bert-base-uncased"))
- .addSpecialTokens(true).setMaxLength(16);
+ .addSpecialTokens(true).setMaxLength(32);
String input = "what was the impact of the manhattan project";
try (var tokenizerWithDefaultPadding = builder.build();
var tokenizerWithPaddingDisabled = builder.setPadding(false).build();
var tokenizerWithPaddingEnabled = builder.setPadding(true).build()) {
assertMaxLengthRespected(10, tokenizerWithDefaultPadding.encode(input));
assertMaxLengthRespected(10, tokenizerWithPaddingDisabled.encode(input));
- assertMaxLengthRespected(16, tokenizerWithPaddingEnabled.encode(input));
+ assertMaxLengthRespected(32, tokenizerWithPaddingEnabled.encode(input));
}
}