summaryrefslogtreecommitdiffstats
path: root/model-integration
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-01-31 17:14:24 +0100
committerHenning Baldersheim <balder@yahoo-inc.com>2024-01-31 17:14:24 +0100
commit3640f1a724ae1e0be4d9c01b8071b001f1c45ab2 (patch)
treecd8e9840af55baba27f9695ee4be5a954d0c25d5 /model-integration
parentec2e5692646b7f6ea2289b9223054cdaf5061f0b (diff)
- Add alternative sparsify implementation using generic tensor.reduce/map.
- Add options for specifying which one to use in tests and performance benchmark. Based on original implementation prior to custom reduce with the following improvements. - Apply Math.log after reduction which is the same optimization as done in the custom implementation. - Join the 2 separate single dimension reduce statements into single 2 dimensional reduce.
Diffstat (limited to 'model-integration')
-rw-r--r--model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java47
-rw-r--r--model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java14
2 files changed, 52 insertions, 9 deletions
diff --git a/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java
index 28f8c4e252f..3a64083c623 100644
--- a/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java
+++ b/model-integration/src/main/java/ai/vespa/embedding/SpladeEmbedder.java
@@ -14,6 +14,8 @@ import com.yahoo.tensor.DirectIndexedAddress;
import com.yahoo.tensor.IndexedTensor;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.TensorType;
+import com.yahoo.tensor.functions.Reduce;
+
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
@@ -32,17 +34,22 @@ public class SpladeEmbedder extends AbstractComponent implements Embedder {
private final String tokenTypeIdsName;
private final String outputName;
private final double termScoreThreshold;
+ private final boolean useCustomReduce;
private final HuggingFaceTokenizer tokenizer;
private final OnnxEvaluator evaluator;
@Inject
public SpladeEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, SpladeEmbedderConfig config) {
+ this(onnx, runtime, config, true);
+ }
+ SpladeEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, SpladeEmbedderConfig config, boolean useCustomReduce) {
this.runtime = runtime;
inputIdsName = config.transformerInputIds();
attentionMaskName = config.transformerAttentionMask();
outputName = config.transformerOutput();
tokenTypeIdsName = config.transformerTokenTypeIds();
termScoreThreshold = config.termScoreThreshold();
+ this.useCustomReduce = useCustomReduce;
var tokenizerPath = Paths.get(config.tokenizerPath().toString());
var builder = new HuggingFaceTokenizer.Builder()
@@ -117,20 +124,54 @@ public class SpladeEmbedder extends AbstractComponent implements Embedder {
Map<String, Tensor> inputs = Map.of(inputIdsName, inputSequence.expand("d0"),
attentionMaskName, attentionMask.expand("d0"),
tokenTypeIdsName, tokenTypeIds.expand("d0"));
- Tensor spladeTensor = sparsify((IndexedTensor) evaluator.evaluate(inputs).get(outputName), tensorType);
+ IndexedTensor output = (IndexedTensor) evaluator.evaluate(inputs).get(outputName);
+ Tensor spladeTensor = useCustomReduce
+ ? sparsifyCustomReduce(output, tensorType)
+ : sparsifyReduce(output, tensorType);
runtime.sampleEmbeddingLatency((System.nanoTime() - start)/1_000_000d, context);
return spladeTensor;
}
/**
- * Sparsify the model output tensor.
+ * Sparsify the output tensor by applying a threshold on the log of the relu of the output.
+ * This uses generic tensor reduce+map, and is slightly slower than a custom unrolled variant.
+ * @param modelOutput the model output tensor of shape d1,dim where d1 is the sequence length and dim is size
+ * of the vocabulary
+ * @param tensorType the type of the destination tensor
+ * @return A mapped tensor with the terms from the vocab that has a score above the threshold
+ */
+ private Tensor sparsifyReduce(Tensor modelOutput, TensorType tensorType) {
+ //Remove batch dim, batch size of 1
+ Tensor output = modelOutput.reduce(Reduce.Aggregator.max, "d0", "d1");
+ Tensor logOfRelu = output.map((x) -> Math.log(1 + (x > 0 ? x : 0)));
+ IndexedTensor vocab = (IndexedTensor) logOfRelu;
+ var builder = Tensor.Builder.of(tensorType);
+ long[] tokens = new long[1];
+ for (int i = 0; i < vocab.size(); i++) {
+ var score = vocab.get(i);
+ if (score > termScoreThreshold) {
+ tokens[0] = i;
+ String term = tokenizer.decode(tokens);
+ builder.cell().
+ label(tensorType.dimensions().get(0).name(), term)
+ .value(score);
+ }
+ }
+ return builder.build();
+ }
+
+
+
+ /**
+ * Sparsify the model output tensor. This uses an unrolled custom reduce and is 15-20% faster than using the
+ * generic tensor reduce.
*
* @param modelOutput the model output tensor of type tensorType
* @param tensorType the type of the destination tensor
* @return A mapped tensor with the terms from the vocab that has a score above the threshold
*/
- public Tensor sparsify(IndexedTensor modelOutput, TensorType tensorType) {
+ public Tensor sparsifyCustomReduce(IndexedTensor modelOutput, TensorType tensorType) {
var builder = Tensor.Builder.of(tensorType);
long[] shape = modelOutput.shape();
if(shape.length != 3) {
diff --git a/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java b/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java
index b48051814ab..e2b1caf4441 100644
--- a/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java
+++ b/model-integration/src/test/java/ai/vespa/embedding/SpladeEmbedderTest.java
@@ -48,12 +48,12 @@ public class SpladeEmbedderTest {
public void testPerformanceNotTerrible() {
String text = "what was the manhattan project in this context it was a secret project to develop a nuclear weapon in world war" +
" ii the project was led by the united states with the support of the united kingdom and canada";
- Long now = System.currentTimeMillis();
- int n = 1000; // Takes around 7s on Intel core i9 2.4Ghz (macbook pro, 2019)
+ long now = System.currentTimeMillis();
+ int n = 1000; // 7s on Intel core i9 2.4Ghz (macbook pro, 2019) using custom reduce, 8s if using generic reduce
for (int i = 0; i < n; i++) {
assertEmbed("tensor<float>(t{})", text, indexingContext);
}
- Long elapsed = System.currentTimeMillis() - now;
+ long elapsed = System.currentTimeMillis() - now;
System.out.println("Elapsed time: " + elapsed + " ms");
}
@@ -72,9 +72,11 @@ public class SpladeEmbedderTest {
static {
indexingContext = new Embedder.Context("schema.indexing");
- spladeEmbedder = getEmbedder();
+ // Custom reduce is 14% faster than generic reduce and is the default.
+ // Keeping as option for performance testing
+ spladeEmbedder = getEmbedder(false);
}
- private static Embedder getEmbedder() {
+ private static Embedder getEmbedder(boolean useCustomReduce) {
String vocabPath = "src/test/models/onnx/transformer/real_tokenizer.json";
String modelPath = "src/test/models/onnx/transformer/dummy_transformer_mlm.onnx";
assumeTrue(OnnxRuntime.isRuntimeAvailable(modelPath));
@@ -83,6 +85,6 @@ public class SpladeEmbedderTest {
builder.transformerModel(ModelReference.valueOf(modelPath));
builder.termScoreThreshold(scoreThreshold);
builder.transformerGpuDevice(-1);
- return new SpladeEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build());
+ return new SpladeEmbedder(new OnnxRuntime(), Embedder.Runtime.testInstance(), builder.build(), useCustomReduce);
}
}