diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-31 17:14:24 +0100 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-31 17:14:24 +0100 |
commit | 3640f1a724ae1e0be4d9c01b8071b001f1c45ab2 (patch) | |
tree | cd8e9840af55baba27f9695ee4be5a954d0c25d5 /model-integration | |
parent | ec2e5692646b7f6ea2289b9223054cdaf5061f0b (diff) |
- Add alternative sparsify implementation using generic tensor.reduce/map.
- Add options for specifying which one to use in tests and performance benchmark.
Based on the original implementation prior to the custom reduce, with the following improvements.
- Apply Math.log after reduction, which is the same optimization as done in the custom implementation.
- Join the 2 separate single dimension reduce statements into single 2 dimensional reduce.
Diffstat (limited to 'model-integration')
-rw-r--r-- | model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java | 47 | ||||
-rw-r--r-- | model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java | 14 |
2 files changed, 52 insertions, 9 deletions
diff --git a/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java index 28f8c4e252f..3a64083c623 100644 --- a/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java +++ b/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java @@ -14,6 +14,8 @@ import com.yahoo.tensor.DirectIndexedAddress; import com.yahoo.tensor.IndexedTensor; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; +import com.yahoo.tensor.functions.Reduce; + import java.nio.file.Paths; import java.util.List; import java.util.Map; @@ -32,17 +34,22 @@ public class SpladeEmbedder extends AbstractComponent implements Embedder { private final String tokenTypeIdsName; private final String outputName; private final double termScoreThreshold; + private final boolean useCustomReduce; private final HuggingFaceTokenizer tokenizer; private final OnnxEvaluator evaluator; @Inject public SpladeEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, SpladeEmbedderConfig config) { + this(onnx, runtime, config, true); + } + SpladeEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, SpladeEmbedderConfig config, boolean useCustomReduce) { this.runtime = runtime; inputIdsName = config.transformerInputIds(); attentionMaskName = config.transformerAttentionMask(); outputName = config.transformerOutput(); tokenTypeIdsName = config.transformerTokenTypeIds(); termScoreThreshold = config.termScoreThreshold(); + this.useCustomReduce = useCustomReduce; var tokenizerPath = Paths.get(config.tokenizerPath().toString()); var builder = new HuggingFaceTokenizer.Builder() @@ -117,20 +124,54 @@ public class SpladeEmbedder extends AbstractComponent implements Embedder { Map<String, Tensor> inputs = Map.of(inputIdsName, inputSequence.expand("d0"), attentionMaskName, attentionMask.expand("d0"), tokenTypeIdsName, tokenTypeIds.expand("d0")); - Tensor spladeTensor = sparsify((IndexedTensor) 
evaluator.evaluate(inputs).get(outputName), tensorType); + IndexedTensor output = (IndexedTensor) evaluator.evaluate(inputs).get(outputName); + Tensor spladeTensor = useCustomReduce + ? sparsifyCustomReduce(output, tensorType) + : sparsifyReduce(output, tensorType); runtime.sampleEmbeddingLatency((System.nanoTime() - start)/1_000_000d, context); return spladeTensor; } /** - * Sparsify the model output tensor. + * Sparsify the output tensor by applying a threshold on the log of the relu of the output. + * This uses generic tensor reduce+map, and is slightly slower than a custom unrolled variant. + * @param modelOutput the model output tensor of shape d1,dim where d1 is the sequence length and dim is size + * of the vocabulary + * @param tensorType the type of the destination tensor + * @return A mapped tensor with the terms from the vocab that has a score above the threshold + */ + private Tensor sparsifyReduce(Tensor modelOutput, TensorType tensorType) { + //Remove batch dim, batch size of 1 + Tensor output = modelOutput.reduce(Reduce.Aggregator.max, "d0", "d1"); + Tensor logOfRelu = output.map((x) -> Math.log(1 + (x > 0 ? x : 0))); + IndexedTensor vocab = (IndexedTensor) logOfRelu; + var builder = Tensor.Builder.of(tensorType); + long[] tokens = new long[1]; + for (int i = 0; i < vocab.size(); i++) { + var score = vocab.get(i); + if (score > termScoreThreshold) { + tokens[0] = i; + String term = tokenizer.decode(tokens); + builder.cell(). + label(tensorType.dimensions().get(0).name(), term) + .value(score); + } + } + return builder.build(); + } + + + + /** + * Sparsify the model output tensor.This uses an unrolled custom reduce and is 15-20% faster than the using + * generic tensor reduce. 
* * @param modelOutput the model output tensor of type tensorType * @param tensorType the type of the destination tensor * @return A mapped tensor with the terms from the vocab that has a score above the threshold */ - public Tensor sparsify(IndexedTensor modelOutput, TensorType tensorType) { + public Tensor sparsifyCustomReduce(IndexedTensor modelOutput, TensorType tensorType) { var builder = Tensor.Builder.of(tensorType); long[] shape = modelOutput.shape(); if(shape.length != 3) { diff --git a/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java b/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java index b48051814ab..e2b1caf4441 100644 --- a/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java +++ b/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java @@ -48,12 +48,12 @@ public class SpladeEmbedderTest { public void testPerformanceNotTerrible() { String text = "what was the manhattan project in this context it was a secret project to develop a nuclear weapon in world war" + " ii the project was led by the united states with the support of the united kingdom and canada"; - Long now = System.currentTimeMillis(); - int n = 1000; // Takes around 7s on Intel core i9 2.4Ghz (macbook pro, 2019) + long now = System.currentTimeMillis(); + int n = 1000; // 7s on Intel core i9 2.4Ghz (macbook pro, 2019) using custom reduce, 8s if using generic reduce for (int i = 0; i < n; i++) { assertEmbed("tensor<float>(t{})", text, indexingContext); } - Long elapsed = System.currentTimeMillis() - now; + long elapsed = System.currentTimeMillis() - now; System.out.println("Elapsed time: " + elapsed + " ms"); } @@ -72,9 +72,11 @@ public class SpladeEmbedderTest { static { indexingContext = new Embedder.Context("schema.indexing"); - spladeEmbedder = getEmbedder(); + // Custom reduce is 14% faster than generic reduce and the default. 
+ // Keeping as option for performance testing + spladeEmbedder = getEmbedder(false); } - private static Embedder getEmbedder() { + private static Embedder getEmbedder(boolean useCustomReduce) { String vocabPath = "src/test/models/onnx/transformer/real_tokenizer.json"; String modelPath = "src/test/models/onnx/transformer/dummy_transformer_mlm.onnx"; assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath)); @@ -83,6 +85,6 @@ public class SpladeEmbedderTest { builder.transformerModel(ModelReference.valueOf(modelPath)); builder.termScoreThreshold(scoreThreshold); builder.transformerGpuDevice(-1); - return new SpladeEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build()); + return new SpladeEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build(), useCustomReduce); } } |