diff options
9 files changed, 49 insertions, 22 deletions
diff --git a/config-provisioning/src/main/resources/configdefinitions/config.provisioning.node-repository.def b/config-provisioning/src/main/resources/configdefinitions/config.provisioning.node-repository.def index b054f434322..6a9c388f8b0 100644 --- a/config-provisioning/src/main/resources/configdefinitions/config.provisioning.node-repository.def +++ b/config-provisioning/src/main/resources/configdefinitions/config.provisioning.node-repository.def @@ -7,6 +7,9 @@ containerImage string default="registry.example.com:9999/myorg/vespa" # Default container image to use for tenant nodes. If this is unset (empty), it defaults to containerImage. tenantContainerImage string default="" +# Default container image to use for tenant nodes with GPU resources. If this is unset (empty), starting nodes with GPUs will fail +tenantGpuContainerImage string default="" + # Whether to cache data read from ZooKeeper in-memory. useCuratorClientCache bool default=false diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index fb21b009a30..c490c50c940 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -7,7 +7,6 @@ import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.config.provision.ApplicationTransaction; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.DockerImage; -import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.Zone; import com.yahoo.config.provisioning.NodeRepositoryConfig; @@ -88,7 +87,8 @@ public class NodeRepository extends AbstractComponent { zone, new DnsNameResolver(), DockerImage.fromString(config.containerImage()), - Optional.of(config.tenantContainerImage()).filter(s -> !s.isEmpty()).map(DockerImage::fromString), + optionalImage(config.tenantContainerImage()), + optionalImage(config.tenantGpuContainerImage()), flagSource, metricsDb, orchestrator, @@ -109,6 +109,7 @@ public class NodeRepository extends AbstractComponent { NameResolver nameResolver, DockerImage containerImage, Optional<DockerImage> tenantContainerImage, + Optional<DockerImage> tenantGpuContainerImage, FlagSource flagSource, MetricsDb metricsDb, Orchestrator orchestrator, @@ -132,7 +133,7 @@ public class NodeRepository extends AbstractComponent { this.osVersions = new OsVersions(this); this.infrastructureVersions = new InfrastructureVersions(db); this.firmwareChecks = new FirmwareChecks(db, clock); - this.containerImages = new ContainerImages(containerImage, tenantContainerImage); + this.containerImages = new ContainerImages(containerImage, tenantContainerImage, tenantGpuContainerImage); this.archiveUris = new ArchiveUris(db); this.jobControl = new JobControl(new JobControlFlags(db, flagSource)); this.loadBalancers = new LoadBalancers(db); @@ -231,4 +232,8 @@ public class NodeRepository extends AbstractComponent { applications.remove(transaction); } + private static Optional<DockerImage> optionalImage(String image) { + return Optional.of(image).filter(s -> !s.isEmpty()).map(DockerImage::fromString); + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImages.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImages.java index f1358788c17..8553172cef3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImages.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImages.java @@ -12,7 +12,7 @@ import java.util.Optional; * This class decides the container image to use for a given node. Two sources are considered, in the following order: * * 1. Requested image (from node allocation, this is set by either a feature flag or through services.xml) - * 2. Default image, specified in the node repository config file + * 2. Default image for the node type/configuration, specified in the node repository config file. * * Independent of source, the registry part of the image is rewritten to match the one set in the node repository config * file. @@ -24,10 +24,12 @@ public class ContainerImages { private final DockerImage defaultImage; private final Optional<DockerImage> tenantImage; + private final Optional<DockerImage> tenantGpuImage; - public ContainerImages(DockerImage defaultImage, Optional<DockerImage> tenantContainerImage) { + public ContainerImages(DockerImage defaultImage, Optional<DockerImage> tenantContainerImage, Optional<DockerImage> tenantGpuImage) { this.defaultImage = Objects.requireNonNull(defaultImage); this.tenantImage = Objects.requireNonNull(tenantContainerImage); + this.tenantGpuImage = Objects.requireNonNull(tenantGpuImage); } /** Returns the container image to use for given node */ @@ -39,7 +41,13 @@ public class ContainerImages { if (requestedImage.isPresent()) { image = requestedImage.get(); } else if (nodeType == NodeType.tenant) { - image = tenantImage.orElse(defaultImage); + if (tenantImage.isPresent()) { + image = tenantImage.get(); + } else if (!node.resources().gpuResources().isZero()) { + image = tenantGpuImage.orElseThrow(() -> new IllegalArgumentException(node + " has GPU resources, but there is no GPU container image available")); + } else { + image = defaultImage; + } } else { image = defaultImage; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index 93e33051616..5bd53a2f8af 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -81,6 +81,7 @@ public class MockNodeRepository extends NodeRepository { new MockNameResolver().mockAnyLookup(), DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), Optional.empty(), + Optional.empty(), new InMemoryFlagSource(), new MemoryMetricsDb(Clock.fixed(Instant.ofEpochMilli(123), ZoneId.of("Z"))), new OrchestratorMock(), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTester.java index cd73914850d..b964bf871c1 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTester.java @@ -45,6 +45,7 @@ public class NodeRepositoryTester { new MockNameResolver().mockAnyLookup(), DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), Optional.empty(), + Optional.empty(), new InMemoryFlagSource(), new MemoryMetricsDb(clock), new OrchestratorMock(), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java index d9eef310c20..606bc55fdd2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java @@ -72,6 +72,7 @@ public class CapacityCheckerTester { new MockNameResolver().mockAnyLookup(), DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), Optional.empty(), + Optional.empty(), new InMemoryFlagSource(), new MemoryMetricsDb(clock), new OrchestratorMock(), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java index 00fff017836..c9421f098e7 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java @@ -266,6 +266,7 @@ public class SpareCapacityMaintainerTest { new MockNameResolver().mockAnyLookup(), DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), Optional.empty(), + Optional.empty(), new InMemoryFlagSource(), new MemoryMetricsDb(clock), new OrchestratorMock(), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImagesTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImagesTest.java index 217ead40b81..20b299c85bc 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImagesTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ContainerImagesTest.java @@ -5,14 +5,11 @@ import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterMembership; import com.yahoo.config.provision.DockerImage; -import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.Generation; -import com.yahoo.vespa.hosted.provision.node.IP; -import com.yahoo.vespa.hosted.provision.testutils.MockNodeFlavors; import org.junit.Test; import java.util.Optional; @@ -29,7 +26,8 @@ public class ContainerImagesTest { public void image_selection() { DockerImage defaultImage = DockerImage.fromString("registry.example.com/vespa/default"); DockerImage tenantImage = DockerImage.fromString("registry.example.com/vespa/tenant"); - ContainerImages images = new ContainerImages(defaultImage, Optional.of(tenantImage)); + DockerImage gpuImage = DockerImage.fromString("registry.example.com/vespa/tenant-gpu"); + ContainerImages images = new ContainerImages(defaultImage, Optional.of(tenantImage), Optional.of(gpuImage)); assertEquals(defaultImage, images.get(node(NodeType.confighost))); // For preload purposes assertEquals(defaultImage, images.get(node(NodeType.config))); @@ -45,27 +43,35 @@ public class ContainerImagesTest { assertEquals(requested, images.get(node(NodeType.tenant, requested))); // When there is no custom tenant image, the default one is used - images = new ContainerImages(defaultImage, Optional.empty()); + images = new ContainerImages(defaultImage, Optional.empty(), Optional.of(gpuImage)); assertEquals(defaultImage, images.get(node(NodeType.host))); assertEquals(defaultImage, images.get(node(NodeType.tenant))); + + // Choose GPU when node has GPU resources + assertEquals(gpuImage, images.get(node(NodeType.tenant, null, true))); } private static Node node(NodeType type) { - return node(type, null); + return node(type, null, false); } private static Node node(NodeType type, DockerImage requested) { - Flavor flavor = new MockNodeFlavors().getFlavorOrThrow("default"); - Node.Builder b = Node.create(type + "1", new IP.Config(Set.of(), Set.of()), type + "1.example.com", flavor, type); - if (requested != null) { - b.allocation(new Allocation(ApplicationId.defaultId(), - ClusterMembership.from("container/id1/4/37", - Version.fromString("1.2.3"), - Optional.of(requested)), - NodeResources.unspecified(), - Generation.initial(), - false)); + return node(type, requested, false); + } + + private static Node node(NodeType type, DockerImage requested, boolean gpu) { + NodeResources resources = new NodeResources(4, 8, 100, 0.3); + if (gpu) { + resources = resources.with(new NodeResources.GpuResources(1, 16)); } + Node.Builder b = Node.reserve(Set.of("::1"), type + "1", "parent1", resources, type); + b.allocation(new Allocation(ApplicationId.defaultId(), + ClusterMembership.from("container/id1/4/37", + Version.fromString("1.2.3"), + Optional.ofNullable(requested)), + resources, + Generation.initial(), + false)); return b.build(); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index 405d9578c95..110569a371a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -113,6 +113,7 @@ public class ProvisioningTester { nameResolver, containerImage, Optional.empty(), + Optional.empty(), flagSource, new MemoryMetricsDb(clock), orchestrator, |