summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2023-02-09 12:11:21 +0100
committer Martin Polden <mpolden@mpolden.no> 2023-02-09 12:12:20 +0100
commit e02adcebeb90661046893a2c386e330cdc925327 (patch)
tree 317427d7f6ed1af49205017670ac87fc2c2089b3
parent 5c5a79813ea6d6d57326f82145816200b8a41622 (diff)
Fix GPU detection
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java | 6
-rw-r--r-- config-model/src/test/cfg/application/onnx/services.xml | 9
-rw-r--r-- config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java | 11
-rw-r--r-- config-model/src/test/java/com/yahoo/vespa/model/ml/StatelessOnnxEvaluationTest.java | 89
-rw-r--r-- model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluator.java | 2
5 files changed, 78 insertions, 39 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java
index 81626581722..59c0b668057 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java
@@ -688,9 +688,9 @@ public class ContainerModelBuilder extends ConfigModelBuilder<ContainerModel> {
Element gpuDeviceElement = XML.getChild(modelElement, "gpu-device");
if (gpuDeviceElement != null) {
int gpuDevice = Integer.parseInt(gpuDeviceElement.getTextContent());
- Capacity capacity = context.getDeployState().provisioned().all().get(cluster.id());
- boolean gpuProvisioned = capacity != null && !capacity.minResources().nodeResources().gpuResources().isZero();
- onnxModel.setGpuDevice(gpuDevice, gpuProvisioned);
+ boolean hasGpu = cluster.getContainers().stream().anyMatch(container -> container.getHostResource() != null &&
+ !container.getHostResource().realResources().gpuResources().isZero());
+ onnxModel.setGpuDevice(gpuDevice, hasGpu);
}
}
diff --git a/config-model/src/test/cfg/application/onnx/services.xml b/config-model/src/test/cfg/application/onnx/services.xml
index b17e34e66c2..8c60be77ff5 100644
--- a/config-model/src/test/cfg/application/onnx/services.xml
+++ b/config-model/src/test/cfg/application/onnx/services.xml
@@ -17,8 +17,10 @@
</models>
</onnx>
</model-evaluation>
- <nodes>
- <node hostalias="node1" />
+ <nodes count="2">
+ <resources vcpu="4" memory="16Gb" disk="125Gb">
+ <gpu count="1" memory="16Gb"/>
+ </resources>
</nodes>
</container>
@@ -27,9 +29,6 @@
<documents>
<document mode="index" type="test"/>
</documents>
- <nodes>
- <node distribution-key="0" hostalias="node1" />
- </nodes>
</content>
</services>
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java b/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java
index 598b6b103bf..97c222e75d3 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java
@@ -44,12 +44,15 @@ public class ImportedModelTester {
private final DeployState deployState;
public ImportedModelTester(String modelName, Path applicationDir) {
+ this(modelName, applicationDir, new DeployState.Builder());
+ }
+
+ public ImportedModelTester(String modelName, Path applicationDir, DeployState.Builder deployStateBuilder) {
this.modelName = modelName;
this.applicationDir = applicationDir;
- deployState = new DeployState.Builder()
- .applicationPackage(ApplicationPackageTester.create(applicationDir.toString()).app())
- .modelImporters(importers)
- .build();
+ deployState = deployStateBuilder.applicationPackage(ApplicationPackageTester.create(applicationDir.toString()).app())
+ .modelImporters(importers)
+ .build();
}
public VespaModel createVespaModel() {
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/ml/StatelessOnnxEvaluationTest.java b/config-model/src/test/java/com/yahoo/vespa/model/ml/StatelessOnnxEvaluationTest.java
index 8ccbe99f70a..a731e9c7ccc 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/ml/StatelessOnnxEvaluationTest.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/ml/StatelessOnnxEvaluationTest.java
@@ -7,6 +7,10 @@ import ai.vespa.models.evaluation.Model;
import ai.vespa.models.evaluation.ModelsEvaluator;
import com.yahoo.component.ComponentId;
import com.yahoo.config.application.api.ApplicationPackage;
+import com.yahoo.config.model.deploy.DeployState;
+import com.yahoo.config.model.deploy.TestProperties;
+import com.yahoo.config.model.provision.InMemoryProvisioner;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.filedistribution.fileacquirer.FileAcquirer;
import com.yahoo.filedistribution.fileacquirer.MockFileAcquirer;
import com.yahoo.io.IOUtils;
@@ -21,13 +25,14 @@ import com.yahoo.vespa.model.container.ApplicationContainerCluster;
import org.junit.jupiter.api.Test;
import java.io.File;
-import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
/**
@@ -60,23 +65,23 @@ public class StatelessOnnxEvaluationTest {
}
@Test
- void testStatelessOnnxModelEvaluation() throws IOException {
+ void testStatelessOnnxModelEvaluation() throws Exception {
assumeTrue(OnnxEvaluator.isRuntimeAvailable());
Path appDir = Path.fromString("src/test/cfg/application/onnx");
Path storedAppDir = appDir.append("copy");
try {
- ImportedModelTester tester = new ImportedModelTester("onnx_rt", appDir);
- assertModelEvaluation(tester.createVespaModel(), appDir);
+ ImportedModelTester tester = new ImportedModelTester("onnx_rt", appDir, new DeployState.Builder());
+ assertModelEvaluation(tester.createVespaModel(), appDir, false);
// At this point the expression is stored - copy application to another location which does not have a models dir
storedAppDir.toFile().mkdirs();
IOUtils.copy(appDir.append("services.xml").toString(), storedAppDir.append("services.xml").toString());
IOUtils.copyDirectory(appDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile(),
- storedAppDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile());
+ storedAppDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile());
IOUtils.copyDirectory(appDir.append(ApplicationPackage.SCHEMAS_DIR).toFile(),
- storedAppDir.append(ApplicationPackage.SCHEMAS_DIR).toFile());
+ storedAppDir.append(ApplicationPackage.SCHEMAS_DIR).toFile());
ImportedModelTester storedTester = new ImportedModelTester("onnx_rt", storedAppDir);
- assertModelEvaluation(storedTester.createVespaModel(), appDir);
+ assertModelEvaluation(storedTester.createVespaModel(), appDir, false);
} finally {
IOUtils.recursiveDeleteDir(appDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile());
@@ -84,7 +89,26 @@ public class StatelessOnnxEvaluationTest {
}
}
- private void assertModelEvaluation(VespaModel model, Path appDir) {
+ /**
+ * Builds the ONNX test application with hosted, multitenant deploy properties and an in-memory
+ * provisioner whose node resources include one GPU, then checks that the generated ONNX model
+ * config marks the GPU device as required.
+ */
+ @Test
+ void testStatelessOnnxModelEvaluationWithGpu() {
+ // Aborts (rather than fails) the test when the ONNX runtime is not available
+ assumeTrue(OnnxEvaluator.isRuntimeAvailable());
+ // GPU count 1 / memory 16 mirrors <gpu count="1" memory="16Gb"/> in the test services.xml
+ NodeResources resources = new NodeResources(4, 16, 125, 10,
+ NodeResources.DiskSpeed.fast, NodeResources.StorageType.local,
+ NodeResources.Architecture.x86_64,
+ new NodeResources.GpuResources(1, 16));
+ // NOTE(review): presumably 6 is the number of hosts offered and false a provisioning flag - confirm against InMemoryProvisioner
+ InMemoryProvisioner provisioner = new InMemoryProvisioner(6, resources, false);
+ DeployState.Builder deployStateBuilder = new DeployState.Builder().modelHostProvisioner(provisioner)
+ .properties(new TestProperties().setMultitenant(true).setHostedVespa(true));
+ Path appDir = Path.fromString("src/test/cfg/application/onnx");
+ try {
+ ImportedModelTester tester = new ImportedModelTester("onnx_rt", appDir, deployStateBuilder);
+ // true: the model config is expected to have gpu_device_required set
+ assertModelEvaluation(tester.createVespaModel(), appDir, true);
+ } finally {
+ // Remove the generated models directory under the application directory
+ IOUtils.recursiveDeleteDir(appDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile());
+ }
+ }
+
+ private void assertModelEvaluation(VespaModel model, Path appDir, boolean shouldRequireGpu) {
ApplicationContainerCluster cluster = model.getContainerClusters().get("container");
assertNotNull(cluster.getComponentsMap().get(new ComponentId(ModelsEvaluator.class.getName())));
@@ -108,29 +132,42 @@ public class StatelessOnnxEvaluationTest {
Set<String> modelNames = config.rankprofile().stream().map(v -> v.name()).collect(Collectors.toSet());
assertTrue(modelNames.contains("mul"));
+ OnnxModelsConfig.Model mulModel = onnxModelsConfig.model().get(0);
+ assertEquals(2, mulModel.stateless_intraop_threads());
+ assertEquals(-1, mulModel.stateless_interop_threads());
+ assertEquals("", mulModel.stateless_execution_mode());
+ assertEquals(shouldRequireGpu, mulModel.gpu_device_required());
+ assertEquals(0, mulModel.gpu_device());
+
// This is actually how ModelsEvaluator is injected
Map<String, File> fileMap = new HashMap<>();
for (OnnxModelsConfig.Model onnxModel : onnxModelsConfig.model()) {
fileMap.put(onnxModel.fileref().value(), appDir.append(onnxModel.fileref().value()).toFile());
}
FileAcquirer fileAcquirer = MockFileAcquirer.returnFiles(fileMap);
- ModelsEvaluator modelsEvaluator = new ModelsEvaluator(config, constantsConfig, expressionsConfig, onnxModelsConfig, fileAcquirer);
- assertEquals(1, modelsEvaluator.models().size());
-
- Model mul = modelsEvaluator.models().get("mul");
- FunctionEvaluator evaluator = mul.evaluatorOf(); // or "default.output" - or actually use name of model output
-
- Tensor input1 = Tensor.from("tensor<float>(d0[1]):[2]");
- Tensor input2 = Tensor.from("tensor<float>(d0[1]):[3]");
- Tensor output = evaluator.bind("input1", input1).bind("input2", input2).evaluate();
- assertEquals(6.0, output.sum().asDouble(), 1e-9);
-
- OnnxModelsConfig.Model mulModel = onnxModelsConfig.model().get(0);
- assertEquals(2, mulModel.stateless_intraop_threads());
- assertEquals(-1, mulModel.stateless_interop_threads());
- assertEquals("", mulModel.stateless_execution_mode());
- assertFalse(mulModel.gpu_device_required());
- assertEquals(0, mulModel.gpu_device());
+        // Evaluate the "mul" model end to end. When a GPU is required, evaluation cannot succeed on a
+        // machine where CUDA fails to initialize, so that specific failure is accepted below.
+        try {
+            ModelsEvaluator modelsEvaluator = new ModelsEvaluator(config, constantsConfig, expressionsConfig, onnxModelsConfig, fileAcquirer);
+            assertEquals(1, modelsEvaluator.models().size());
+
+            Model mul = modelsEvaluator.models().get("mul");
+            FunctionEvaluator evaluator = mul.evaluatorOf(); // or "default.output" - or actually use name of model output
+
+            Tensor input1 = Tensor.from("tensor<float>(d0[1]):[2]");
+            Tensor input2 = Tensor.from("tensor<float>(d0[1]):[3]");
+            Tensor output = evaluator.bind("input1", input1).bind("input2", input2).evaluate();
+            assertEquals(6.0, output.sum().asDouble(), 1e-9);
+        } catch (IllegalArgumentException e) {
+            // Tolerate only the expected CUDA initialization failure, and only when the GPU is required;
+            // every other IllegalArgumentException is rethrown unchanged.
+            boolean gotWantedException = false;
+            for (Throwable cause = e; shouldRequireGpu && cause != null; cause = cause.getCause()) {
+                // Constant-first equals is null-safe: Throwable.getMessage() may return null for a cause
+                // in the chain, and an NPE here would mask the original exception.
+                if ("GPU device is required, but CUDA initialization failed".equals(cause.getMessage())) {
+                    gotWantedException = true;
+                    break;
+                }
+            }
+            if (!gotWantedException) {
+                throw e;
+            }
+        }
}
}
diff --git a/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluator.java b/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluator.java
index 563ef911f8f..5cc7991d197 100644
--- a/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluator.java
+++ b/model-integration/src/main/java/ai/vespa/modelintegration/evaluator/OnnxEvaluator.java
@@ -101,7 +101,7 @@ public class OnnxEvaluator {
return createSession(modelPath, environment, options, false);
}
if (isCudaError(e)) {
- throw new IllegalArgumentException("GPU device is requested, but CUDA initialization failed", e);
+ throw new IllegalArgumentException("GPU device is required, but CUDA initialization failed", e);
}
throw new RuntimeException("ONNX Runtime exception", e);
}