diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-31 17:14:24 +0100 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-31 17:14:24 +0100 |
commit | 3640f1a724ae1e0be4d9c01b8071b001f1c45ab2 (patch) | |
tree | cd8e9840af55baba27f9695ee4be5a954d0c25d5 /model-integration | |
parent | ec2e5692646b7f6ea2289b9223054cdaf5061f0b (diff) |
- Add alternative sparsify implementation using generic tensor.reduce/map.
- Add options for specifying which one to use in tests and performance benchmark.
Based on the original implementation prior to the custom reduce, with the following improvements.
- Apply Math.log after reduction, which is the same optimization as done in the custom implementation.
- Join the 2 separate single dimension reduce statements into single 2 dimensional reduce.
Diffstat (limited to 'model-integration')
-rw-r--r-- | model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java | 47 | ||||
-rw-r--r-- | model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java | 14 |
2 files changed, 52 insertions, 9 deletions
diff --git a/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java index 28f8c4e252f..3a64083c623 100644 --- a/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java +++ b/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java @@ -14,6 +14,8 @@ import com.yahoo.tensor.DirectIndexedAddress; import com.yahoo.tensor.IndexedTensor; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; +import com.yahoo.tensor.functions.Reduce; + import java.nio.file.Paths; import java.util.List; import java.util.Map; @@ -32,17 +34,22 @@ public class SpladeEmbedder extends AbstractComponent implements Embedder { private final String tokenTypeIdsName; private final String outputName; private final double termScoreThreshold; + private final boolean useCustomReduce; private final HuggingFaceTokenizer tokenizer; private final OnnxEvaluator evaluator; @Inject public SpladeEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, SpladeEmbedderConfig config) { + this(onnx, runtime, config, true); + } + SpladeEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, SpladeEmbedderConfig config, boolean useCustomReduce) { this.runtime = runtime; inputIdsName = config.transformerInputIds(); attentionMaskName = config.transformerAttentionMask(); outputName = config.transformerOutput(); tokenTypeIdsName = config.transformerTokenTypeIds(); termScoreThreshold = config.termScoreThreshold(); + this.useCustomReduce = useCustomReduce; var tokenizerPath = Paths.get(config.tokenizerPath().toString()); var builder = new HuggingFaceTokenizer.Builder() @@ -117,20 +124,54 @@ public class SpladeEmbedder extends AbstractComponent implements Embedder { Map<String, Tensor> inputs = Map.of(inputIdsName, inputSequence.expand("d0"), attentionMaskName, attentionMask.expand("d0"), tokenTypeIdsName, tokenTypeIds.expand("d0")); - Tensor spladeTensor = sparsify((IndexedTensor) 
evaluator.evaluate(inputs).get(outputName), tensorType); + IndexedTensor output = (IndexedTensor) evaluator.evaluate(inputs).get(outputName); + Tensor spladeTensor = useCustomReduce + ? sparsifyCustomReduce(output, tensorType) + : sparsifyReduce(output, tensorType); runtime.sampleEmbeddingLatency((System.nanoTime() - start)/1_000_000d, context); return spladeTensor; } /** - * Sparsify the model output tensor. + * Sparsify the output tensor by applying a threshold on the log of the relu of the output. + * This uses generic tensor reduce+map, and is slightly slower than a custom unrolled variant. + * @param modelOutput the model output tensor of shape d1,dim where d1 is the sequence length and dim is size + * of the vocabulary + * @param tensorType the type of the destination tensor + * @return A mapped tensor with the terms from the vocab that has a score above the threshold + */ + private Tensor sparsifyReduce(Tensor modelOutput, TensorType tensorType) { + //Remove batch dim, batch size of 1 + Tensor output = modelOutput.reduce(Reduce.Aggregator.max, "d0", "d1"); + Tensor logOfRelu = output.map((x) -> Math.log(1 + (x > 0 ? x : 0))); + IndexedTensor vocab = (IndexedTensor) logOfRelu; + var builder = Tensor.Builder.of(tensorType); + long[] tokens = new long[1]; + for (int i = 0; i < vocab.size(); i++) { + var score = vocab.get(i); + if (score > termScoreThreshold) { + tokens[0] = i; + String term = tokenizer.decode(tokens); + builder.cell(). + label(tensorType.dimensions().get(0).name(), term) + .value(score); + } + } + return builder.build(); + } + + + + /** + * Sparsify the model output tensor.This uses an unrolled custom reduce and is 15-20% faster than the using + * generic tensor reduce. 
* * @param modelOutput the model output tensor of type tensorType * @param tensorType the type of the destination tensor * @return A mapped tensor with the terms from the vocab that has a score above the threshold */ - public Tensor sparsify(IndexedTensor modelOutput, TensorType tensorType) { + public Tensor sparsifyCustomReduce(IndexedTensor modelOutput, TensorType tensorType) { var builder = Tensor.Builder.of(tensorType); long[] shape = modelOutput.shape(); if(shape.length != 3) { diff --git a/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java b/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java index b48051814ab..e2b1caf4441 100644 --- a/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java +++ b/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java @@ -48,12 +48,12 @@ public class SpladeEmbedderTest { public void testPerformanceNotTerrible() { String text = "what was the manhattan project in this context it was a secret project to develop a nuclear weapon in world war" + " ii the project was led by the united states with the support of the united kingdom and canada"; - Long now = System.currentTimeMillis(); - int n = 1000; // Takes around 7s on Intel core i9 2.4Ghz (macbook pro, 2019) + long now = System.currentTimeMillis(); + int n = 1000; // 7s on Intel core i9 2.4Ghz (macbook pro, 2019) using custom reduce, 8s if using generic reduce for (int i = 0; i < n; i++) { assertEmbed("tensor<float>(t{})", text, indexingContext); } - Long elapsed = System.currentTimeMillis() - now; + long elapsed = System.currentTimeMillis() - now; System.out.println("Elapsed time: " + elapsed + " ms"); } @@ -72,9 +72,11 @@ public class SpladeEmbedderTest { static { indexingContext = new Embedder.Context("schema.indexing"); - spladeEmbedder = getEmbedder(); + // Custom reduce is 14% faster than generic reduce and the default. 
+ // Keeping as option for performance testing + spladeEmbedder = getEmbedder(false); } - private static Embedder getEmbedder() { + private static Embedder getEmbedder(boolean useCustomReduce) { String vocabPath = "src/test/models/onnx/transformer/real_tokenizer.json"; String modelPath = "src/test/models/onnx/transformer/dummy_transformer_mlm.onnx"; assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath)); @@ -83,6 +85,6 @@ public class SpladeEmbedderTest { builder.transformerModel(ModelReference.valueOf(modelPath)); builder.termScoreThreshold(scoreThreshold); builder.transformerGpuDevice(-1); - return new SpladeEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build()); + return new SpladeEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build(), useCustomReduce); } } |