1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package ai.vespa.embedding;
import ai.vespa.modelintegration.evaluator.OnnxRuntime;
import com.yahoo.config.ModelReference;
import com.yahoo.embedding.BertBaseEmbedderConfig;
import com.yahoo.language.process.Embedder;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.TensorType;
import org.junit.Test;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.junit.Assume.assumeTrue;
public class BertBaseEmbedderTest {
@Test
public void testEmbedder() {
String vocabPath = "src/test/models/onnx/transformer/dummy_vocab.txt";
String modelPath = "src/test/models/onnx/transformer/dummy_transformer.onnx";
assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath));
BertBaseEmbedderConfig.Builder builder = new BertBaseEmbedderConfig.Builder();
builder.tokenizerVocab(ModelReference.valueOf(vocabPath));
builder.transformerModel(ModelReference.valueOf(modelPath));
BertBaseEmbedder embedder = newBertBaseEmbedder(builder.build());
TensorType destType = TensorType.fromSpec("tensor<float>(x[7])");
List<Integer> tokens = List.of(1,2,3,4,5); // use random tokens instead of invoking the tokenizer
Tensor embedding = embedder.embedTokens(tokens, destType);
Tensor expected = Tensor.from("tensor<float>(x[7]):[-0.6178509, -0.8135831, 0.34416935, 0.3912577, -0.13068882, 2.5897025E-4, -0.18638384]");
assertEquals(embedding, expected);
}
@Test
public void testEmbedderWithoutTokenTypeIdsName() {
String vocabPath = "src/test/models/onnx/transformer/dummy_vocab.txt";
String modelPath = "src/test/models/onnx/transformer/dummy_transformer_without_type_ids.onnx";
assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath));
BertBaseEmbedderConfig.Builder builder = new BertBaseEmbedderConfig.Builder();
builder.tokenizerVocab(ModelReference.valueOf(vocabPath));
builder.transformerModel(ModelReference.valueOf(modelPath));
builder.transformerTokenTypeIds("");
BertBaseEmbedder embedder = newBertBaseEmbedder(builder.build());
TensorType destType = TensorType.fromSpec("tensor<float>(x[7])");
List<Integer> tokens = List.of(1,2,3,4,5); // use random tokens instead of invoking the tokenizer
Tensor embedding = embedder.embedTokens(tokens, destType);
Tensor expected = Tensor.from("tensor<float>(x[7]):[0.10873623, 0.56411576, 0.6044973, -0.4819714, 0.7519982, -0.83261716, 0.30430704]");
assertEquals(embedding, expected);
}
@Test
public void testEmbedderWithoutTokenTypeIdsNameButWithConfig() {
String vocabPath = "src/test/models/onnx/transformer/dummy_vocab.txt";
String modelPath = "src/test/models/onnx/transformer/dummy_transformer_without_type_ids.onnx";
assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath));
BertBaseEmbedderConfig.Builder builder = new BertBaseEmbedderConfig.Builder();
builder.tokenizerVocab(ModelReference.valueOf(vocabPath));
builder.transformerModel(ModelReference.valueOf(modelPath));
// we did not configured BertBaseEmbedder to accept missing token type ids
// so we expect ctor to throw
assertThrows(IllegalArgumentException.class, () -> { newBertBaseEmbedder(builder.build()); });
}
private static BertBaseEmbedder newBertBaseEmbedder(BertBaseEmbedderConfig cfg) {
return new BertBaseEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), cfg);
}
}
|