diff options
author | Bjørn Christian Seime <bjorncs@yahooinc.com> | 2023-05-09 09:49:40 +0200 |
---|---|---|
committer | Bjørn Christian Seime <bjorncs@yahooinc.com> | 2023-05-09 13:02:43 +0200 |
commit | 372bd2c677bb9707c55a9153f860fb2017ce6ffc (patch) | |
tree | 064bccc8e96f496e76ac5be148ab2f91e65c5ed7 | |
parent | 9a41de7b23dce838df8a8ebac42fc41da3478bb9 (diff) |
Reapply "Bjorncs/embedder onnx gpu"
Updated model-integration to depend on configdefinitions instead of searchcore, since OnnxModelsConfig has been moved there.
13 files changed, 76 insertions, 17 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainer.java b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainer.java index f901bf3c826..9e21fd2d23a 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainer.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainer.java @@ -9,6 +9,7 @@ import com.yahoo.config.model.producer.TreeConfigProducer; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.search.config.QrStartConfig; +import com.yahoo.vespa.config.search.core.OnnxModelsConfig; import com.yahoo.vespa.model.container.component.SimpleComponent; import java.time.Duration; import java.util.Optional; @@ -20,6 +21,7 @@ import java.util.Optional; */ public final class ApplicationContainer extends Container implements QrStartConfig.Producer, + OnnxModelsConfig.Producer, ZookeeperServerConfig.Producer { private final boolean isHostedVespa; @@ -42,12 +44,15 @@ public final class ApplicationContainer extends Container implements @Override public void getConfig(QrStartConfig.Builder builder) { - if (getHostResource() != null) { - NodeResources nodeResources = getHostResource().realResources(); - if ( ! 
nodeResources.isUnspecified()) { - builder.jvm.availableProcessors(Math.max(2, (int)Math.ceil(nodeResources.vcpu()))); - } - } + realResources().ifPresent(r -> builder.jvm.availableProcessors(Math.max(2, (int) Math.ceil(r.vcpu())))); + } + + @Override + public void getConfig(OnnxModelsConfig.Builder builder) { + realResources().ifPresent(r -> { + int count = r.gpuResources().count(); + if (count >= 0) builder.gpu.count(count); + }); } @Override @@ -84,4 +89,14 @@ public final class ApplicationContainer extends Container implements @Override public Optional<String> getPreShutdownCommand() { return Optional.of(prepareStopCommand(Duration.ofMinutes(6))); } + private Optional<NodeResources> realResources() { + if (getHostResource() != null) { + NodeResources nodeResources = getHostResource().realResources(); + if ( ! nodeResources.isUnspecified()) { + return Optional.of(nodeResources); + } + } + return Optional.empty(); + } + } diff --git a/config-model/src/test/derived/globalphase_onnx_inside/onnx-models.cfg b/config-model/src/test/derived/globalphase_onnx_inside/onnx-models.cfg index d63e85e2f19..99f65336794 100644 --- a/config-model/src/test/derived/globalphase_onnx_inside/onnx-models.cfg +++ b/config-model/src/test/derived/globalphase_onnx_inside/onnx-models.cfg @@ -1,3 +1,4 @@ +gpu.count -1 model[].name "direct" model[].fileref "files/ax_plus_b.onnx" model[].input[].name "vector_B" diff --git a/config-model/src/test/derived/globalphase_token_functions/onnx-models.cfg b/config-model/src/test/derived/globalphase_token_functions/onnx-models.cfg index 6283159c324..cea4c065014 100644 --- a/config-model/src/test/derived/globalphase_token_functions/onnx-models.cfg +++ b/config-model/src/test/derived/globalphase_token_functions/onnx-models.cfg @@ -1,3 +1,4 @@ +gpu.count -1 model[].name "my_ranking_model" model[].fileref "files/ranking_model.onnx" model[].input[].name "input_ids" diff --git a/config-model/src/test/derived/vector_constant/onnx-models.cfg 
b/config-model/src/test/derived/vector_constant/onnx-models.cfg index 4c52b72b519..1dcaf0e1bd6 100644 --- a/config-model/src/test/derived/vector_constant/onnx-models.cfg +++ b/config-model/src/test/derived/vector_constant/onnx-models.cfg @@ -1,3 +1,4 @@ +gpu.count -1 model[].name "inside" model[].fileref "ax_plus_b.onnx" model[].input[].name "vector_B" diff --git a/configdefinitions/src/vespa/onnx-models.def b/configdefinitions/src/vespa/onnx-models.def index b8f5d319075..67a83e2afb7 100644 --- a/configdefinitions/src/vespa/onnx-models.def +++ b/configdefinitions/src/vespa/onnx-models.def @@ -1,6 +1,9 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. namespace=vespa.config.search.core +# Number of GPUs available for ONNX evaluation, or -1 if unknown. +gpu.count int default=-1 + model[].name string model[].fileref file model[].input[].name string diff --git a/model-evaluation/src/main/java/ai/vespa/models/evaluation/ModelsEvaluator.java b/model-evaluation/src/main/java/ai/vespa/models/evaluation/ModelsEvaluator.java index fd5306f9add..303d2acd79e 100644 --- a/model-evaluation/src/main/java/ai/vespa/models/evaluation/ModelsEvaluator.java +++ b/model-evaluation/src/main/java/ai/vespa/models/evaluation/ModelsEvaluator.java @@ -41,7 +41,7 @@ public class ModelsEvaluator extends AbstractComponent { RankingExpressionsConfig expressionsConfig, OnnxModelsConfig onnxModelsConfig, FileAcquirer fileAcquirer) { - this(config, constantsConfig, expressionsConfig, onnxModelsConfig, fileAcquirer, new OnnxRuntime()); + this(config, constantsConfig, expressionsConfig, onnxModelsConfig, fileAcquirer, new OnnxRuntime(onnxModelsConfig)); } public ModelsEvaluator(RankProfilesConfigImporter importer, diff --git a/model-integration/pom.xml b/model-integration/pom.xml index c27ed9d2c31..d5d7ae534a4 100644 --- a/model-integration/pom.xml +++ b/model-integration/pom.xml @@ -40,6 +40,12 @@ </dependency> <dependency> 
<groupId>com.yahoo.vespa</groupId> + <artifactId>searchcore</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.yahoo.vespa</groupId> <artifactId>searchlib</artifactId> <version>${project.version}</version> <scope>provided</scope> diff --git a/model-integration/src/main/java/ai/vespa/embedding/BertBaseEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/BertBaseEmbedder.java index 8e5211ccff1..b172ef7beee 100644 --- a/model-integration/src/main/java/ai/vespa/embedding/BertBaseEmbedder.java +++ b/model-integration/src/main/java/ai/vespa/embedding/BertBaseEmbedder.java @@ -58,6 +58,7 @@ public class BertBaseEmbedder extends AbstractComponent implements Embedder { OnnxEvaluatorOptions options = new OnnxEvaluatorOptions(); options.setExecutionMode(config.onnxExecutionMode().toString()); options.setThreads(config.onnxInterOpThreads(), config.onnxIntraOpThreads()); + if (config.onnxGpuDevice() >= 0) options.setGpuDevice(config.onnxGpuDevice()); tokenizer = new WordPieceEmbedder.Builder(config.tokenizerVocab().toString()).build(); this.evaluator = onnx.evaluatorOf(config.transformerModel().toString(), options); diff --git a/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java index 21dd326689c..cc13254385b 100644 --- a/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java +++ b/model-integration/src/main/java/ai/vespa/embedding/huggingface/HuggingFaceEmbedder.java @@ -40,7 +40,7 @@ public class HuggingFaceEmbedder extends AbstractComponent implements Embedder { tokenizer = new HuggingFaceTokenizer(Paths.get(config.tokenizerPath().toString())); var onnxOpts = new OnnxEvaluatorOptions(); if (config.transformerGpuDevice() >= 0) - onnxOpts.setGpuDevice(config.transformerGpuDevice(), config.transformerGpuRequired()); + 
onnxOpts.setGpuDevice(config.transformerGpuDevice()); onnxOpts.setExecutionMode(config.transformerExecutionMode().toString()); onnxOpts.setThreads(config.transformerInterOpThreads(), config.transformerIntraOpThreads()); evaluator = onnx.evaluatorOf(config.transformerModel().toString(), onnxOpts); diff --git a/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluatorOptions.java b/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluatorOptions.java index 4a35f4275fa..6048be8aca9 100644 --- a/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluatorOptions.java +++ b/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluatorOptions.java @@ -17,7 +17,7 @@ import static ai.onnxruntime.OrtSession.SessionOptions.ExecutionMode.SEQUENTIAL; */ public class OnnxEvaluatorOptions { - private final OrtSession.SessionOptions.OptLevel optimizationLevel; + private OrtSession.SessionOptions.OptLevel optimizationLevel; private OrtSession.SessionOptions.ExecutionMode executionMode; private int interOpThreads; private int intraOpThreads; @@ -86,6 +86,8 @@ public class OnnxEvaluatorOptions { this.gpuDeviceRequired = required; } + public void setGpuDevice(int deviceNumber) { gpuDeviceNumber = deviceNumber; } + public boolean requestingGpu() { return gpuDeviceNumber > -1; } @@ -94,6 +96,19 @@ public class OnnxEvaluatorOptions { return gpuDeviceRequired; } + public int gpuDeviceNumber() { return gpuDeviceNumber; } + + public OnnxEvaluatorOptions copy() { + var copy = new OnnxEvaluatorOptions(); + copy.gpuDeviceNumber = gpuDeviceNumber; + copy.gpuDeviceRequired = gpuDeviceRequired; + copy.executionMode = executionMode; + copy.interOpThreads = interOpThreads; + copy.intraOpThreads = intraOpThreads; + copy.optimizationLevel = optimizationLevel; + return copy; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git 
a/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxRuntime.java b/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxRuntime.java index ece1db55c1e..ab44a2ae33f 100644 --- a/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxRuntime.java +++ b/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxRuntime.java @@ -10,6 +10,7 @@ import com.yahoo.component.annotation.Inject; import com.yahoo.jdisc.ResourceReference; import com.yahoo.jdisc.refcount.DebugReferencesWithStack; import com.yahoo.jdisc.refcount.References; +import com.yahoo.vespa.config.search.core.OnnxModelsConfig; import net.jpountz.xxhash.XXHashFactory; import java.io.IOException; @@ -52,17 +53,24 @@ public class OnnxRuntime extends AbstractComponent { private final Object monitor = new Object(); private final Map<OrtSessionId, SharedOrtSession> sessions = new HashMap<>(); private final OrtSessionFactory factory; + private final int gpusAvailable; - @Inject public OnnxRuntime() { this(defaultFactory); } + // For test use only + public OnnxRuntime() { this(defaultFactory, new OnnxModelsConfig.Builder().build()); } - OnnxRuntime(OrtSessionFactory factory) { this.factory = factory; } + @Inject public OnnxRuntime(OnnxModelsConfig cfg) { this(defaultFactory, cfg); } + + OnnxRuntime(OrtSessionFactory factory, OnnxModelsConfig cfg) { + this.factory = factory; + this.gpusAvailable = cfg.gpu().count(); + } public OnnxEvaluator evaluatorOf(byte[] model) { return new OnnxEvaluator(model, null, this); } public OnnxEvaluator evaluatorOf(byte[] model, OnnxEvaluatorOptions options) { - return new OnnxEvaluator(model, options, this); + return new OnnxEvaluator(model, overrideOptions(options), this); } public OnnxEvaluator evaluatorOf(String modelPath) { @@ -70,7 +78,7 @@ public class OnnxRuntime extends AbstractComponent { } public OnnxEvaluator evaluatorOf(String modelPath, OnnxEvaluatorOptions options) { - return new 
OnnxEvaluator(modelPath, options, this); + return new OnnxEvaluator(modelPath, overrideOptions(options), this); } public static OrtEnvironment ortEnvironment() { @@ -167,6 +175,16 @@ public class OnnxRuntime extends AbstractComponent { } } + private OnnxEvaluatorOptions overrideOptions(OnnxEvaluatorOptions opts) { + // Set GPU device required if GPU requested and GPUs are available on system + if (gpusAvailable > 0 && opts.requestingGpu() && !opts.gpuDeviceRequired()) { + var copy = opts.copy(); + copy.setGpuDevice(opts.gpuDeviceNumber(), true); + return copy; + } + return opts; + } + int sessionsCached() { synchronized(monitor) { return sessions.size(); } } static class ReferencedOrtSession implements AutoCloseable { diff --git a/model-integration/src/main/resources/configdefinitions/embedding.bert-base-embedder.def b/model-integration/src/main/resources/configdefinitions/embedding.bert-base-embedder.def index ef42d81e1fe..e37a33d3b81 100644 --- a/model-integration/src/main/resources/configdefinitions/embedding.bert-base-embedder.def +++ b/model-integration/src/main/resources/configdefinitions/embedding.bert-base-embedder.def @@ -28,4 +28,4 @@ transformerOutput string default=output_0 onnxExecutionMode enum { parallel, sequential } default=sequential onnxInterOpThreads int default=1 onnxIntraOpThreads int default=-4 # n=number of threads -> n<0: CPUs/(-n), n==0: CPUs, n>0: n - +onnxGpuDevice int default=-1 diff --git a/model-integration/src/main/resources/configdefinitions/embedding.huggingface.hugging-face-embedder.def b/model-integration/src/main/resources/configdefinitions/embedding.huggingface.hugging-face-embedder.def index adc8f653168..584f23046ba 100644 --- a/model-integration/src/main/resources/configdefinitions/embedding.huggingface.hugging-face-embedder.def +++ b/model-integration/src/main/resources/configdefinitions/embedding.huggingface.hugging-face-embedder.def @@ -17,9 +17,6 @@ transformerAttentionMask string default=attention_mask # Output name 
transformerOutput string default=last_hidden_state -# GPU configuration -transformerGpuDevice int default=-1 -transformerGpuRequired bool default=false # Normalize tensors from tokenizer normalize bool default=false @@ -28,3 +25,4 @@ normalize bool default=false transformerExecutionMode enum { parallel, sequential } default=sequential transformerInterOpThreads int default=1 transformerIntraOpThreads int default=-4 +transformerGpuDevice int default=-1 |