diff options
3 files changed, 140 insertions, 5 deletions
diff --git a/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java index 35645deffa4..9a0f5cdb294 100644 --- a/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java +++ b/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java @@ -15,8 +15,10 @@ import com.yahoo.tensor.IndexedTensor; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorAddress; import com.yahoo.tensor.TensorType; +import com.yahoo.vespa.config.search.core.ProtonConfig; import java.nio.file.Paths; +import java.util.BitSet; import java.util.List; import java.util.Map; import java.util.logging.Logger; @@ -124,18 +126,44 @@ public class HuggingFaceEmbedder extends AbstractComponent implements Embedder { } Map<String, Tensor> outputs = evaluator.evaluate(inputs); - Tensor tokenEmbeddings = outputs.get(outputName); - var result = poolingStrategy.toSentenceEmbedding(tensorType, tokenEmbeddings, attentionMask); - var normalized = normalize ? normalize(result, tensorType) : result; + IndexedTensor tokenEmbeddings = (IndexedTensor) outputs.get(outputName); + long[] resultShape = tokenEmbeddings.shape(); + //shape batch, sequence, embedding dimensionality + if (resultShape.length != 3) { + throw new IllegalArgumentException("" + + "Expected 3 output dimensions for output name '" + + outputName + "': [batch, sequence, embedding], got " + resultShape.length); + } + Tensor result; + if (tensorType.valueType() == TensorType.Value.INT8) { + long outputDimensions = resultShape[2]; + long targetDim = tensorType.dimensions().get(0).size().get(); + + if(targetDim * 8 > outputDimensions) { + throw new IllegalArgumentException("Cannot pack " + outputDimensions + " into " + targetDim + " int8s"); + } + //Dimensionality flexibility 🪆 - packing only the first 8*targetDim values from the model output + long firstDimensions = 8 * targetDim; + String name = tensorType.indexedSubtype().dimensions().get(0).name(); + //perform pooling and normalizing using floating point embeddings before binarizing + //using the firstDimensions as the target dimensionality + TensorType poolingType = new TensorType.Builder(TensorType.Value.FLOAT).indexed(name, firstDimensions).build(); + result = poolingStrategy.toSentenceEmbedding(poolingType, tokenEmbeddings, attentionMask); + result = normalize? normalize(result, poolingType) : result; + result = binarize((IndexedTensor) result, tensorType); + + } else { // regular floating points embeddings + result = poolingStrategy.toSentenceEmbedding(tensorType, tokenEmbeddings, attentionMask); + result = normalize ? normalize(result, tensorType) : result; + } runtime.sampleEmbeddingLatency((System.nanoTime() - start)/1_000_000d, context); - return normalized; + return result; } Tensor normalize(Tensor embedding, TensorType tensorType) { double sumOfSquares = 0.0; Tensor.Builder builder = Tensor.Builder.of(tensorType); - for (int i = 0; i < tensorType.dimensions().get(0).size().get(); i++) { double item = embedding.get(TensorAddress.of(i)); sumOfSquares += item * item; @@ -151,6 +179,29 @@ public class HuggingFaceEmbedder extends AbstractComponent implements Embedder { return builder.build(); } + Tensor binarize(IndexedTensor embedding, TensorType tensorType) { + Tensor.Builder builder = Tensor.Builder.of(tensorType); + BitSet bitSet = new BitSet(8); + int index = 0; + for (int d = 0; d < embedding.sizeAsInt(); d++) { + var value = embedding.get(d); + int bitIndex = 7 - (d % 8); + if (value > 0.0) { + bitSet.set(bitIndex); + } else { + bitSet.clear(bitIndex); + } + if ((d + 1) % 8 == 0) { + byte[] bytes = bitSet.toByteArray(); + byte packed = (bytes.length == 0) ? 0 : bytes[0]; + builder.cell(TensorAddress.of(index), packed); + index++; + bitSet = new BitSet(8); + } + } + return builder.build(); + } + private IndexedTensor createTensorRepresentation(List<Long> input, String dimension) { int size = input.size(); TensorType type = new TensorType.Builder(TensorType.Value.FLOAT).indexed(dimension, size).build(); diff --git a/model-integration/src/test/java/ai/vespa/embedding/HuggingFaceEmbedderTest.java b/model-integration/src/test/java/ai/vespa/embedding/HuggingFaceEmbedderTest.java new file mode 100644 index 00000000000..cb3ebe11154 --- /dev/null +++ b/model-integration/src/test/java/ai/vespa/embedding/HuggingFaceEmbedderTest.java @@ -0,0 +1,84 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package ai.vespa.embedding; + +import ai.vespa.embedding.huggingface.HuggingFaceEmbedder; +import ai.vespa.modelintegration.evaluator.OnnxRuntime; +import com.yahoo.config.ModelReference; +import com.yahoo.embedding.huggingface.HuggingFaceEmbedderConfig; +import com.yahoo.language.process.Embedder; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; +import org.junit.Test; + +import static org.junit.Assert.assertThrows; +import static org.junit.Assume.assumeTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HuggingFaceEmbedderTest { + + static HuggingFaceEmbedder embedder = getEmbedder(); + static HuggingFaceEmbedder normalizedEmbedder = getNormalizedEmbedder(); + static Embedder.Context context = new Embedder.Context("schema.indexing"); + + + @Test + public void testEmbedder() { + String input = "This is a test"; + Tensor result = embedder.embed(input, context, TensorType.fromSpec(("tensor<float>(x[8])"))); + assertEquals("tensor<float>(x[8]):[-0.666825, 0.33570012, 0.22756238, 0.0919357, -0.06958359, 0.32301554, 0.42277765, 0.27041236]", result.toAbbreviatedString()); + // Thresholding on the above gives [0, 1, 1, 1, 0, 1, 1, 1] which is packed as int8 119 + Tensor binarizedResult = embedder.embed(input, context, TensorType.fromSpec(("tensor<int8>(x[1])"))); + assertEquals("tensor<int8>(x[1]):[119]", binarizedResult.toString()); + + binarizedResult = embedder.embed(input, context, TensorType.fromSpec(("tensor<int8>(x[2])"))); + assertEquals("tensor<int8>(x[2]):[119, 44]", binarizedResult.toAbbreviatedString()); + + binarizedResult = embedder.embed(input, context, TensorType.fromSpec(("tensor<int8>(x[48])"))); + assertTrue(binarizedResult.toAbbreviatedString().startsWith("tensor<int8>(x[48]):[119, 44")); + + assertThrows(IllegalArgumentException.class, () -> { + // throws because the target tensor type is not compatible with the model output + //49*8 > 384 + embedder.embed(input, context, TensorType.fromSpec(("tensor<int8>(x[49])"))); + }); + + Tensor float16Result = embedder.embed(input, context, TensorType.fromSpec(("tensor<bfloat16>(x[1])"))); + assertEquals("tensor<bfloat16>(x[1]):[-0.666825]", float16Result.toAbbreviatedString()); + } + + @Test + public void testEmbedderWithNormalization() { + String input = "This is a test"; + + Tensor result = normalizedEmbedder.embed(input, context, TensorType.fromSpec(("tensor<float>(x[8])"))); + assertEquals(1.0, result.multiply(result).sum().asDouble(), 1e-4); + + result = normalizedEmbedder.embed(input, context, TensorType.fromSpec(("tensor<float>(x[16])"))); + assertEquals(1.0, result.multiply(result).sum().asDouble(), 1e-4); + Tensor binarizedResult = embedder.embed(input, context, TensorType.fromSpec(("tensor<int8>(x[2])"))); + assertEquals("tensor<int8>(x[2]):[119, 44]", binarizedResult.toAbbreviatedString()); + } + + private static HuggingFaceEmbedder getEmbedder() { + String vocabPath = "src/test/models/onnx/transformer/real_tokenizer.json"; + String modelPath = "src/test/models/onnx/transformer/embedding_model.onnx"; + assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath)); + HuggingFaceEmbedderConfig.Builder builder = new HuggingFaceEmbedderConfig.Builder(); + builder.tokenizerPath(ModelReference.valueOf(vocabPath)); + builder.transformerModel(ModelReference.valueOf(modelPath)); + builder.transformerGpuDevice(-1); + return new HuggingFaceEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build()); + } + private static HuggingFaceEmbedder getNormalizedEmbedder() { + String vocabPath = "src/test/models/onnx/transformer/real_tokenizer.json"; + String modelPath = "src/test/models/onnx/transformer/embedding_model.onnx"; + assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath)); + HuggingFaceEmbedderConfig.Builder builder = new HuggingFaceEmbedderConfig.Builder(); + builder.tokenizerPath(ModelReference.valueOf(vocabPath)); + builder.transformerModel(ModelReference.valueOf(modelPath)); + builder.transformerGpuDevice(-1); + builder.normalize(true); + return new HuggingFaceEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build()); + } +} diff --git a/model-integration/src/test/models/onnx/transformer/embedding_model.onnx b/model-integration/src/test/models/onnx/transformer/embedding_model.onnx Binary files differnew file mode 100644 index 00000000000..266ed567344 --- /dev/null +++ b/model-integration/src/test/models/onnx/transformer/embedding_model.onnx |