From 412ee6ba430ecf8be28fcf143733556269952f0e Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 5 Jun 2023 11:21:37 +0200 Subject: Upgrade host flavor --- .../com/yahoo/vespa/hosted/provision/Node.java | 9 +- .../provision/maintenance/HostFlavorUpgrader.java | 104 +++++++++++++++++++++ .../maintenance/MaintenanceDeployment.java | 13 +++ .../maintenance/NodeRepositoryMaintenance.java | 6 +- .../yahoo/vespa/hosted/provision/node/Agent.java | 1 + .../yahoo/vespa/hosted/provision/node/Nodes.java | 26 ++++-- .../yahoo/vespa/hosted/provision/node/Status.java | 42 ++++++--- .../provision/persistence/NodeSerializer.java | 9 +- .../provision/provisioning/GroupPreparer.java | 2 +- .../provision/provisioning/HostProvisioner.java | 3 + .../provision/provisioning/NodeAllocation.java | 36 ++++--- .../hosted/provision/restapi/NodePatcher.java | 7 +- .../provision/testutils/MockHostProvisioner.java | 54 ++++++++--- .../provision/testutils/MockNodeRepository.java | 2 +- .../vespa/hosted/provision/NodeRepositoryTest.java | 2 +- .../provision/maintenance/DirtyExpirerTest.java | 11 +-- .../maintenance/HostCapacityMaintainerTest.java | 2 +- .../maintenance/HostFlavorUpgraderTest.java | 85 +++++++++++++++++ .../provision/maintenance/HostRetirerTest.java | 12 +-- .../vespa/hosted/provision/os/OsVersionsTest.java | 2 +- .../provision/persistence/NodeSerializerTest.java | 6 +- .../provisioning/DynamicProvisioningTest.java | 28 ++---- .../provision/provisioning/ProvisioningTester.java | 18 ++++ 23 files changed, 372 insertions(+), 108 deletions(-) create mode 100644 node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java create mode 100644 node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 20bd0be18b6..fe4f4987d34 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -276,7 +276,7 @@ public final class Node implements Nodelike { * If both given wantToRetire and wantToDeprovision are equal to the current values, the method is no-op. */ public Node withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, Agent agent, Instant at) { - return withWantToRetire(wantToRetire, wantToDeprovision, false, agent, at); + return withWantToRetire(wantToRetire, wantToDeprovision, false, false, agent, at); } /** @@ -285,15 +285,16 @@ public final class Node implements Nodelike { * * If all given values are equal to the current ones, the method is no-op. */ - public Node withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild, Agent agent, Instant at) { + public Node withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild, boolean wantToUpgradeFlavor, Agent agent, Instant at) { if (wantToRetire == status.wantToRetire() && wantToDeprovision == status.wantToDeprovision() && - wantToRebuild == status.wantToRebuild()) return this; + wantToRebuild == status.wantToRebuild() && + wantToUpgradeFlavor == status.wantToUpgradeFlavor()) return this; if (wantToRebuild && !wantToRetire && resources().storageType() != NodeResources.StorageType.remote) { throw new IllegalArgumentException("Cannot rebuild " + this + " without retiring because storage is " + resources().storageType()); } - Node node = this.with(status.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild)); + Node node = this.with(status.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, wantToUpgradeFlavor)); if (wantToRetire) node = node.with(history.with(new History.Event(History.Event.Type.wantToRetire, agent, at))); return node; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java new file mode 100644 index 00000000000..6f30d7fedf1 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java @@ -0,0 +1,104 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.Deployer; +import com.yahoo.config.provision.NodeAllocationException; +import com.yahoo.config.provision.NodeType; +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; + +import java.time.Duration; +import java.util.Objects; +import java.util.Optional; +import java.util.Random; +import java.util.logging.Level; + +/** + * This maintainer attempts to upgrade a single host running on an older flavor generation. The upgrade happens by + * marking and retiring the host on the old generation, and redeploying to provision a replacement host on a newer + * generation. + * + * If the cloud provider reports a lack of capacity for the newer generation, retirement of the host is + * cancelled, and upgrade is attempted of the next host on an old flavor, if any. + * + * Once a host has been marked for upgrade, {@link HostResumeProvisioner} will complete provisioning of the replacement + * host. + * + * @author mpolden + */ +public class HostFlavorUpgrader extends NodeRepositoryMaintainer { + + private final HostProvisioner hostProvisioner; + private final Random random; + private final Deployer deployer; + private final Metric metric; + + public HostFlavorUpgrader(NodeRepository nodeRepository, Duration interval, Metric metric, Deployer deployer, HostProvisioner hostProvisioner) { + super(nodeRepository, interval, metric); + this.hostProvisioner = Objects.requireNonNull(hostProvisioner); + this.deployer = Objects.requireNonNull(deployer); + this.metric = Objects.requireNonNull(metric); + this.random = new Random(nodeRepository.clock().millis()); + } + + @Override + protected double maintain() { + if (!nodeRepository().zone().cloud().dynamicProvisioning()) return 1.0; // Not relevant in zones with static capacity + if (nodeRepository().zone().environment().isTest()) return 1.0; // Short-lived deployments + if (!nodeRepository().nodes().isWorking()) return 0.0; + + NodeList allNodes = nodeRepository().nodes().list(); + if (!NodeMover.zoneIsStable(allNodes)) return 1.0; + return upgradeHostFlavor(allNodes); + } + + private double upgradeHostFlavor(NodeList allNodes) { + NodeList activeNodes = allNodes.nodeType(NodeType.tenant) + .state(Node.State.active) + .shuffle(random); // Shuffle to avoid getting stuck trying to upgrade the same host + int attempts = 0; + int failures = 0; + for (var node : activeNodes) { + Optional parent = allNodes.parentOf(node); + if (parent.isEmpty()) continue; + if (!hostProvisioner.canUpgradeFlavor(parent.get(), node)) continue; + if (parent.get().status().wantToUpgradeFlavor()) continue; // Already upgrading + + boolean redeployed = false; + boolean deploymentValid = false; + try (MaintenanceDeployment deployment = new MaintenanceDeployment(node.allocation().get().owner(), deployer, metric, nodeRepository(), true)) { + deploymentValid = deployment.isValid(); + if (!deploymentValid) continue; + + attempts++; + log.log(Level.INFO, () -> "Redeploying " + node.allocation().get().owner() + " to upgrade flavor (" + + parent.get().flavor().name() + ") of " + parent.get()); + upgradeFlavor(parent.get(), true); + deployment.activate(); + redeployed = true; + return asSuccessFactorDeviation(attempts, failures); + } catch (NodeAllocationException e) { + log.log(Level.WARNING, "No capacity for flavor upgrade of " + parent + ". Retrying in " + + interval(), e); + } finally { + if (deploymentValid && !redeployed) { // Cancel upgrade if redeploy failed + failures++; + upgradeFlavor(parent.get(), false); + } + } + } + return 1.0; + } + + private void upgradeFlavor(Node host, boolean upgrade) { + nodeRepository().nodes().upgradeFlavor(host.hostname(), + Agent.HostFlavorUpgrader, + nodeRepository().clock().instant(), + upgrade); + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java index be398f6e8ad..14013007da0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java @@ -38,6 +38,7 @@ class MaintenanceDeployment implements Closeable { private final Metric metric; private final Optional lock; private final Optional deployment; + private final boolean throwOnFailure; private boolean closed = false; @@ -45,8 +46,17 @@ class MaintenanceDeployment implements Closeable { Deployer deployer, Metric metric, NodeRepository nodeRepository) { + this(application, deployer, metric, nodeRepository, false); + } + + public MaintenanceDeployment(ApplicationId application, + Deployer deployer, + Metric metric, + NodeRepository nodeRepository, + boolean throwOnFailure) { this.application = application; this.metric = metric; + this.throwOnFailure = throwOnFailure; Optional lock = tryLock(application, nodeRepository); try { @@ -93,6 +103,9 @@ class MaintenanceDeployment implements Closeable { } catch (RuntimeException e) { metric.add(ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_FAILURE.baseName(), 1, metric.createContext(Map.of())); log.log(Level.WARNING, "Exception on maintenance deploy of " + application, e); + if (throwOnFailure) { + throw e; + } return Optional.empty(); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 67ab36c725e..c7d5efe376b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -70,7 +70,9 @@ public class NodeRepositoryMaintenance extends AbstractComponent { new HostDeprovisioner(nodeRepository, defaults.hostDeprovisionerInterval, metric, hostProvisioner), new HostResumeProvisioner(nodeRepository, defaults.hostResumeProvisionerInterval, metric, hostProvisioner), new HostRetirer(nodeRepository, defaults.hostRetirerInterval, metric, hostProvisioner), - new DiskReplacer(nodeRepository, defaults.diskReplacerInterval, metric, hostProvisioner))) + new DiskReplacer(nodeRepository, defaults.diskReplacerInterval, metric, hostProvisioner), + new HostFlavorUpgrader(nodeRepository, defaults.hostFlavorUpgraderInterval, metric, deployer, hostProvisioner)) + ) .ifPresent(maintainers::addAll); // The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now infrastructureProvisioner.maintainButThrowOnException(); @@ -118,6 +120,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration scalingSuggestionsInterval; private final Duration switchRebalancerInterval; private final Duration hostRetirerInterval; + private final Duration hostFlavorUpgraderInterval; private final NodeFailer.ThrottlePolicy throttlePolicy; @@ -151,6 +154,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { switchRebalancerInterval = Duration.ofHours(1); throttlePolicy = NodeFailer.ThrottlePolicy.hosted; hostRetirerInterval = Duration.ofMinutes(30); + hostFlavorUpgraderInterval = Duration.ofMinutes(30); if (zone.environment().isProduction() && ! isCdZone) { inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java index 0b2bdb4620a..e03b77b91b8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java @@ -29,5 +29,6 @@ public enum Agent { SpareCapacityMaintainer, SwitchRebalancer, HostEncrypter, + HostFlavorUpgrader, } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 1a97f1e02f7..29327aaf93a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -167,7 +167,8 @@ public class Nodes { if (rebuilding) { node = node.with(node.status().withWantToRetire(existing.get().status().wantToRetire(), false, - rebuilding)); + rebuilding, + existing.get().status().wantToUpgradeFlavor())); } nodesToRemove.add(existing.get()); } @@ -190,7 +191,7 @@ public class Nodes { if (node.status().wantToDeprovision() || node.status().wantToRebuild()) return park(node.hostname(), false, agent, reason); - node = node.withWantToRetire(false, false, false, agent, clock.instant()); + node = node.withWantToRetire(false, false, false, false, agent, clock.instant()); return db.writeTo(Node.State.ready, node, agent, Optional.of(reason)); } @@ -638,6 +639,11 @@ public class Nodes { return decommission(hostname, soft ? HostOperation.softRebuild : HostOperation.rebuild, agent, instant); } + /** Upgrade flavor for given host */ + public List upgradeFlavor(String hostname, Agent agent, Instant instant, boolean upgrade) { + return decommission(hostname, upgrade ? HostOperation.upgradeFlavor : HostOperation.cancel, agent, instant); + } + private List decommission(String hostname, HostOperation op, Agent agent, Instant instant) { Optional nodeMutex = lockAndGet(hostname); if (nodeMutex.isEmpty()) return List.of(); @@ -645,20 +651,20 @@ public class Nodes { boolean wantToDeprovision = op == HostOperation.deprovision; boolean wantToRebuild = op == HostOperation.rebuild || op == HostOperation.softRebuild; boolean wantToRetire = op.needsRetirement(); + boolean wantToUpgradeFlavor = op == HostOperation.upgradeFlavor; Node host = nodeMutex.get().node(); try (NodeMutex lock = nodeMutex.get()) { if ( ! host.type().isHost()) throw new IllegalArgumentException("Cannot " + op + " non-host " + host); try (Mutex allocationLock = lockUnallocated()) { // Modify parent with wantToRetire while holding the allocationLock to prevent // any further allocation of nodes on this host - Node newHost = lock.node().withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, agent, instant); + Node newHost = lock.node().withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, wantToUpgradeFlavor, agent, instant); result.add(write(newHost, lock)); } } - - if (wantToRetire) { // Apply recursively if we're retiring + if (wantToRetire || op == HostOperation.cancel) { // Apply recursively if we're retiring, or cancelling List updatedNodes = performOn(list().childrenOf(host), (node, nodeLock) -> { - Node newNode = node.withWantToRetire(wantToRetire, wantToDeprovision, false, agent, instant); + Node newNode = node.withWantToRetire(wantToRetire, wantToDeprovision, false, false, agent, instant); return write(newNode, nodeLock); }); result.addAll(updatedNodes); @@ -888,7 +894,13 @@ public class Nodes { rebuild(true), /** Host is stopped and re-bootstrapped, data is preserved */ - softRebuild(false); + softRebuild(false), + + /** Host flavor should be upgraded, data is destroyed */ + upgradeFlavor(true), + + /** Attempt to cancel any ongoing operations. If the current operation has progressed too far, cancelling won't have any effect */ + cancel(false); private final boolean needsRetirement; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java index ef0f899ca3e..6e5c5a07fc2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java @@ -24,6 +24,7 @@ public class Status { private final boolean wantToRebuild; private final boolean preferToRetire; private final boolean wantToFail; + private final boolean wantToUpgradeFlavor; private final OsVersion osVersion; private final Optional firmwareVerifiedAt; @@ -36,18 +37,23 @@ public class Status { boolean wantToRebuild, boolean preferToRetire, boolean wantToFail, + boolean wantToUpgradeFlavor, OsVersion osVersion, Optional firmwareVerifiedAt) { this.reboot = Objects.requireNonNull(generation, "Generation must be non-null"); this.vespaVersion = Objects.requireNonNull(vespaVersion, "Vespa version must be non-null").filter(v -> !Version.emptyVersion.equals(v)); this.containerImage = Objects.requireNonNull(containerImage, "Container image must be non-null").filter(d -> !DockerImage.EMPTY.equals(d)); this.failCount = failCount; + this.wantToUpgradeFlavor = wantToUpgradeFlavor; if (wantToDeprovision && wantToRebuild) { throw new IllegalArgumentException("Node cannot be marked both wantToDeprovision and wantToRebuild"); } if (wantToDeprovision && !wantToRetire) { throw new IllegalArgumentException("Node cannot be marked wantToDeprovision unless it's also marked wantToRetire"); } + if (wantToUpgradeFlavor && !wantToRetire) { + throw new IllegalArgumentException("Node cannot be marked wantToUpgradeFlavor unless it's also marked wantToRetire"); + } this.wantToRetire = wantToRetire; this.wantToDeprovision = wantToDeprovision; this.wantToRebuild = wantToRebuild; @@ -58,35 +64,35 @@ public class Status { } /** Returns a copy of this with the reboot generation changed */ - public Status withReboot(Generation reboot) { return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withReboot(Generation reboot) { return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** Returns the reboot generation of this node */ public Generation reboot() { return reboot; } /** Returns a copy of this with the vespa version changed */ - public Status withVespaVersion(Version version) { return new Status(reboot, Optional.of(version), containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withVespaVersion(Version version) { return new Status(reboot, Optional.of(version), containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** Returns the Vespa version installed on the node, if known */ public Optional vespaVersion() { return vespaVersion; } /** Returns a copy of this with the container image changed */ - public Status withContainerImage(DockerImage containerImage) { return new Status(reboot, vespaVersion, Optional.of(containerImage), failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withContainerImage(DockerImage containerImage) { return new Status(reboot, vespaVersion, Optional.of(containerImage), failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** Returns the container image the node is running, if any */ public Optional containerImage() { return containerImage; } - public Status withIncreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount + 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withIncreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount + 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } - public Status withDecreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount - 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withDecreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount - 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } - public Status withFailCount(int value) { return new Status(reboot, vespaVersion, containerImage, value, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withFailCount(int value) { return new Status(reboot, vespaVersion, containerImage, value, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** Returns how many times this node has been moved to the failed state. */ public int failCount() { return failCount; } /** Returns a copy of this with the want to retire/deprovision/rebuild flags changed */ - public Status withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); + public Status withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild, boolean wantToUpgradeFlavor) { + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** @@ -109,9 +115,14 @@ public class Status { */ public boolean preferToRetire() { return preferToRetire; } + /** Returns whether the flavor of is node is required to be of the latest generation */ + public boolean wantToUpgradeFlavor() { + return wantToUpgradeFlavor; + } + /** Returns a copy of this with want to fail set to the given value */ public Status withWantToFail(boolean wantToFail) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** Returns whether this node should be failed */ @@ -119,12 +130,17 @@ public class Status { /** Returns a copy of this with prefer-to-retire set to given value */ public Status withPreferToRetire(boolean preferToRetire) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); + } + + /** Returns a copy of this with wantToUpgradeFlavor set to given value */ + public Status withWantToUpgradeFlavor(boolean wantToUpgradeFlavor) { + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, firmwareVerifiedAt); } /** Returns a copy of this with the OS version set to given version */ public Status withOsVersion(OsVersion version) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, version, firmwareVerifiedAt); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, version, firmwareVerifiedAt); } /** Returns the OS version of this node */ @@ -134,7 +150,7 @@ public class Status { /** Returns a copy of this with the firmwareVerifiedAt set to the given instant. */ public Status withFirmwareVerifiedAt(Instant instant) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, Optional.of(instant)); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, wantToUpgradeFlavor, osVersion, Optional.of(instant)); } /** Returns the last time this node had firmware that was verified to be up to date. */ @@ -145,7 +161,7 @@ public class Status { /** Returns the initial status of a newly provisioned node */ public static Status initial() { return new Status(Generation.initial(), Optional.empty(), Optional.empty(), 0, false, - false, false, false, false, OsVersion.EMPTY, Optional.empty()); + false, false, false, false, false, OsVersion.EMPTY, Optional.empty()); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index b176d0ce765..514689d3d4e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -88,7 +88,8 @@ public class NodeSerializer { private static final String wantToDeprovisionKey = "wantToDeprovision"; private static final String wantToRebuildKey = "wantToRebuild"; private static final String preferToRetireKey = "preferToRetire"; - private static final String wantToFailKey = "wantToFailKey"; + private static final String wantToFailKey = "wantToFailKey"; // TODO: This should be changed to 'wantToFail' + private static final String wantToUpgradeFlavorKey = "wantToUpgradeFlavor"; private static final String osVersionKey = "osVersion"; private static final String wantedOsVersionKey = "wantedOsVersion"; private static final String firmwareCheckKey = "firmwareCheck"; @@ -184,6 +185,7 @@ public class NodeSerializer { object.setBool(wantToDeprovisionKey, node.status().wantToDeprovision()); object.setBool(wantToFailKey, node.status().wantToFail()); object.setBool(wantToRebuildKey, node.status().wantToRebuild()); + object.setBool(wantToUpgradeFlavorKey, node.status().wantToUpgradeFlavor()); node.allocation().ifPresent(allocation -> toSlime(allocation, object.setObject(instanceKey))); toSlime(node.history().events(), object.setArray(historyKey)); toSlime(node.history().log(), object.setArray(logKey)); @@ -315,8 +317,9 @@ public class NodeSerializer { object.field(wantToRebuildKey).asBool(), object.field(preferToRetireKey).asBool(), object.field(wantToFailKey).asBool(), + object.field(wantToUpgradeFlavorKey).asBool(), new OsVersion(versionFromSlime(object.field(osVersionKey)), - versionFromSlime(object.field(wantedOsVersionKey))), + versionFromSlime(object.field(wantedOsVersionKey))), SlimeUtils.optionalInstant(object.field(firmwareCheckKey))); } @@ -488,6 +491,7 @@ public class NodeSerializer { case "SwitchRebalancer" -> Agent.SwitchRebalancer; case "HostEncrypter" -> Agent.HostEncrypter; case "ParkedExpirer" -> Agent.ParkedExpirer; + case "HostFlavorUpgrader" -> Agent.HostFlavorUpgrader; default -> throw new IllegalArgumentException("Unknown node event agent '" + eventAgentField.asString() + "'"); }; } @@ -512,6 +516,7 @@ public class NodeSerializer { case SwitchRebalancer -> "SwitchRebalancer"; case HostEncrypter -> "HostEncrypter"; case ParkedExpirer -> "ParkedExpirer"; + case HostFlavorUpgrader -> "HostFlavorUpgrader"; }; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index 3823f4906c2..07e645025c0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -120,7 +120,7 @@ public class GroupPreparer { Optional.of(cluster.type()), Optional.of(cluster.id()), requestedNodes.cloudAccount(), - false); + deficit.dueToFlavorUpgrade()); hostProvisioner.get().provisionHosts(request, whenProvisioned); } catch (NodeAllocationException e) { // Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java index d678f4c8173..397eb4d7af9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java @@ -76,4 +76,7 @@ public interface HostProvisioner { */ List hostEventsIn(List cloudAccounts); + /** Returns whether flavor for given host can be upgraded to a newer generation */ + boolean canUpgradeFlavor(Node host, Node child); + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index e6f2758ff7f..4ef315b9322 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -74,6 +74,9 @@ class NodeAllocation { /** The number of nodes that just now was changed to retired */ private int wasRetiredJustNow = 0; + /** The number of nodes that just now was changed to retired to upgrade its host flavor */ + private int wasRetiredDueToFlavorUpgrade = 0; + /** The node indexes to verify uniqueness of each member's index */ private final Set indexes = new HashSet<>(); @@ -168,6 +171,7 @@ class NodeAllocation { if ( ! nodeResourceLimits.isWithinRealLimits(candidate, application, cluster)) return Retirement.outsideRealLimits; if (violatesParentHostPolicy(candidate)) return Retirement.violatesParentHostPolicy; if ( ! hasCompatibleResources(candidate)) return Retirement.incompatibleResources; + if (candidate.parent.map(node -> node.status().wantToUpgradeFlavor()).orElse(false)) return Retirement.violatesHostFlavorGeneration; if (candidate.wantToRetire()) return Retirement.hardRequest; if (candidate.preferToRetire() && candidate.replaceableBy(candidates)) return Retirement.softRequest; if (violatesExclusivity(candidate)) return Retirement.violatesExclusivity; @@ -271,6 +275,9 @@ class NodeAllocation { } else if (retirement != Retirement.alreadyRetired) { LOG.info("Retiring " + node + " because " + retirement.description()); ++wasRetiredJustNow; + if (retirement == Retirement.violatesHostFlavorGeneration) { + ++wasRetiredDueToFlavorUpgrade; + } node = node.retire(nodeRepository.clock().instant()); } if ( ! node.allocation().get().membership().cluster().equals(cluster)) { @@ -320,15 +327,19 @@ class NodeAllocation { /** * Returns {@link HostDeficit} describing the host deficit for the given {@link NodeSpec}. * - * @return empty if the requested spec is already fulfilled. Otherwise returns {@link HostDeficit} containing the + * @return empty if the requested spec is already fulfilled. Otherwise, returns {@link HostDeficit} containing the * flavor and host count required to cover the deficit. */ Optional hostDeficit() { if (nodeType().isHost()) { return Optional.empty(); // Hosts are provisioned as required by the child application } + int deficit = requestedNodes.fulfilledDeficitCount(accepted()); + // We can only require flavor upgrade if the entire deficit is caused by upgrades + boolean dueToFlavorUpgrade = deficit == wasRetiredDueToFlavorUpgrade; return Optional.of(new HostDeficit(requestedNodes.resources().orElseGet(NodeResources::unspecified), - requestedNodes.fulfilledDeficitCount(accepted()))) + deficit, + dueToFlavorUpgrade)) .filter(hostDeficit -> hostDeficit.count() > 0); } @@ -489,6 +500,7 @@ class NodeAllocation { softRequest("node is requested to retire"), violatesExclusivity("node violates host exclusivity"), violatesHostFlavor("node violates host flavor"), + violatesHostFlavorGeneration("node violates host flavor generation"), none(""); private final String description; @@ -505,27 +517,11 @@ class NodeAllocation { } /** A host deficit, the number of missing hosts, for a deployment */ - static class HostDeficit { - - private final NodeResources resources; - private final int count; - - private HostDeficit(NodeResources resources, int count) { - this.resources = resources; - this.count = count; - } - - NodeResources resources() { - return resources; - } - - int count() { - return count; - } + record HostDeficit(NodeResources resources, int count, boolean dueToFlavorUpgrade) { @Override public String toString() { - return "deficit of " + count + " nodes with " + resources; + return "deficit of " + count + " nodes with " + resources + (dueToFlavorUpgrade ? ", due to flavor upgrade" : ""); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java index 5d1194f247e..407961dc054 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java @@ -63,6 +63,7 @@ public class NodePatcher { private static final String WANT_TO_RETIRE = "wantToRetire"; private static final String WANT_TO_DEPROVISION = "wantToDeprovision"; private static final String WANT_TO_REBUILD = "wantToRebuild"; + private static final String WANT_TO_UPGRADE_FLAVOR = "wantToUpgradeFlavor"; private static final String REPORTS = "reports"; private static final Set RECURSIVE_FIELDS = Set.of(WANT_TO_RETIRE, WANT_TO_DEPROVISION); private static final Set IP_CONFIG_FIELDS = Set.of("ipAddresses", @@ -223,12 +224,16 @@ public class NodePatcher { case WANT_TO_DEPROVISION: case WANT_TO_REBUILD: // These needs to be handled as one, because certain combinations are not allowed. - return node.withWantToRetire(asOptionalBoolean(root.field(WANT_TO_RETIRE)).orElseGet(node.status()::wantToRetire), + return node.withWantToRetire(asOptionalBoolean(root.field(WANT_TO_RETIRE)) + .orElseGet(node.status()::wantToRetire), asOptionalBoolean(root.field(WANT_TO_DEPROVISION)) .orElseGet(node.status()::wantToDeprovision), asOptionalBoolean(root.field(WANT_TO_REBUILD)) .filter(want -> !applyingAsChild) .orElseGet(node.status()::wantToRebuild), + asOptionalBoolean(root.field(WANT_TO_UPGRADE_FLAVOR)) + .filter(want -> !applyingAsChild) + .orElseGet(node.status()::wantToUpgradeFlavor), Agent.operator, clock.instant()); case REPORTS: diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index 779926a2c83..3d5987cd04d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -22,7 +22,6 @@ import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost; import java.time.Instant; import java.util.ArrayList; import java.util.Collections; -import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -43,10 +42,11 @@ public class MockHostProvisioner implements HostProvisioner { private final MockNameResolver nameResolver; private final int memoryTaxGb; private final Set rebuildsCompleted = new HashSet<>(); + private final Map hostFlavors = new HashMap<>(); + private final Set upgradableFlavors = new HashSet<>(); + private final Map behaviours = new HashMap<>(); private int deprovisionedHosts = 0; - private EnumSet behaviours = EnumSet.noneOf(Behaviour.class); - private Map hostFlavors = new HashMap<>(); public MockHostProvisioner(List flavors, MockNameResolver nameResolver, int memoryTaxGb) { this.flavors = List.copyOf(flavors); @@ -62,8 +62,14 @@ public class MockHostProvisioner implements HostProvisioner { this(flavors, new MockNameResolver().mockAnyLookup(), memoryTaxGb); } + /** Returns whether given behaviour is active for this invocation */ + private boolean behaviour(Behaviour behaviour) { + return behaviours.computeIfPresent(behaviour, (k, old) -> old == 0 ? null : --old) != null; + } + @Override public void provisionHosts(HostProvisionRequest request, Consumer> whenProvisioned) { + if (behaviour(Behaviour.failProvisionRequest)) throw new NodeAllocationException("No capacity for provision request", true); Flavor hostFlavor = hostFlavors.get(request.clusterType().orElse(ClusterSpec.Type.content)); if (hostFlavor == null) hostFlavor = flavors.stream() @@ -92,7 +98,7 @@ public class MockHostProvisioner implements HostProvisioner { @Override public HostIpConfig provision(Node host, Set children) throws FatalProvisioningException { - if (behaviours.contains(Behaviour.failProvisioning)) throw new FatalProvisioningException("Failed to provision node(s)"); + if (behaviour(Behaviour.failProvisioning)) throw new FatalProvisioningException("Failed to provision node(s)"); if (host.state() != Node.State.provisioned) throw new IllegalStateException("Host to provision must be in " + Node.State.provisioned); Map result = new HashMap<>(); result.put(host.hostname(), createIpConfig(host)); @@ -105,7 +111,7 @@ public class MockHostProvisioner implements HostProvisioner { @Override public void deprovision(Node host) { - if (behaviours.contains(Behaviour.failDeprovisioning)) throw new FatalProvisioningException("Failed to deprovision node"); + if (behaviour(Behaviour.failDeprovisioning)) throw new FatalProvisioningException("Failed to deprovision node"); provisionedHosts.removeIf(provisionedHost -> provisionedHost.hostHostname().equals(host.hostname())); deprovisionedHosts++; } @@ -115,7 +121,7 @@ public class MockHostProvisioner implements HostProvisioner { if (!host.type().isHost()) throw new IllegalArgumentException(host + " is not a host"); if (rebuildsCompleted.remove(host.hostname())) { return host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(), - false, Agent.system, Instant.ofEpochMilli(123)); + false, false, Agent.system, Instant.ofEpochMilli(123)); } return host; } @@ -125,6 +131,11 @@ public class MockHostProvisioner implements HostProvisioner { return Collections.unmodifiableList(hostEvents); } + @Override + public boolean canUpgradeFlavor(Node host, Node child) { + return upgradableFlavors.contains(host.flavor().name()); + } + /** Returns the hosts that have been provisioned by this */ public List provisionedHosts() { return Collections.unmodifiableList(provisionedHosts); @@ -136,14 +147,23 @@ public class MockHostProvisioner implements HostProvisioner { } public MockHostProvisioner with(Behaviour first, Behaviour... rest) { - this.behaviours = EnumSet.of(first, rest); + behaviours.put(first, Integer.MAX_VALUE); + for (var b : rest) { + behaviours.put(b, Integer.MAX_VALUE); + } + return this; + } + + public MockHostProvisioner with(Behaviour behaviour, int count) { + behaviours.put(behaviour, count); return this; } public MockHostProvisioner without(Behaviour first, Behaviour... rest) { - Set behaviours = new HashSet<>(this.behaviours); - behaviours.removeAll(EnumSet.of(first, rest)); - this.behaviours = behaviours.isEmpty() ? EnumSet.noneOf(Behaviour.class) : EnumSet.copyOf(behaviours); + behaviours.remove(first); + for (var b : rest) { + behaviours.remove(b); + } return this; } @@ -163,6 +183,11 @@ public class MockHostProvisioner implements HostProvisioner { return this; } + public MockHostProvisioner addUpgradableFlavor(String name) { + upgradableFlavors.add(name); + return this; + } + /** Sets the host flavor to use to the flavor matching these resources exactly, if any. */ public MockHostProvisioner setHostFlavorIfAvailable(NodeResources flavorAdvertisedResources, HostResourcesCalculator calculator, ClusterSpec.Type ... types) { Optional hostFlavor = flavors.stream().filter(f -> calculator.advertisedResourcesOf(f).compatibleWith(flavorAdvertisedResources)) @@ -211,7 +236,7 @@ public class MockHostProvisioner implements HostProvisioner { int hostIndex = Integer.parseInt(node.hostname().replaceAll("^[a-z]+|-\\d+$", "")); Set addresses = Set.of("::" + hostIndex + ":0"); Set ipAddressPool = new HashSet<>(); - if (!behaviours.contains(Behaviour.failDnsUpdate)) { + if (!behaviour(Behaviour.failDnsUpdate)) { nameResolver.addRecord(node.hostname(), addresses.iterator().next()); for (int i = 1; i <= 2; i++) { String ip = "::" + hostIndex + ":" + i; @@ -225,10 +250,13 @@ public class MockHostProvisioner implements HostProvisioner { public enum Behaviour { - /** Fail all calls to {@link MockHostProvisioner#provision(com.yahoo.vespa.hosted.provision.Node, java.util.Set)} */ + /** Fail call to {@link MockHostProvisioner#provision(com.yahoo.vespa.hosted.provision.Node, java.util.Set)} */ failProvisioning, - /** Fail all calls to {@link MockHostProvisioner#deprovision(com.yahoo.vespa.hosted.provision.Node)} */ + /** Fail call to {@link MockHostProvisioner#provisionHosts(HostProvisionRequest, Consumer)} */ + failProvisionRequest, + + /** Fail call to {@link MockHostProvisioner#deprovision(com.yahoo.vespa.hosted.provision.Node)} */ failDeprovisioning, /** Fail DNS updates of provisioned hosts */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index 0a614cc9b2b..b7d6e0a9dd9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -148,7 +148,7 @@ public class MockNodeRepository extends NodeRepository { nodes.add(node10); Node node55 = Node.create("node55", ipConfig(55), "host55.yahoo.com", resources(2, 8, 50, 1, fast, local), NodeType.tenant) - .status(Status.initial().withWantToRetire(true, true, false)) + .status(Status.initial().withWantToRetire(true, true, false, false)) .cloudAccount(defaultCloudAccount).build(); nodes.add(node55); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index 7d9a48f6773..05e01c65798 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -199,7 +199,7 @@ public class NodeRepositoryTest { // Set host 2 properties and deprovision it try (var lock = tester.nodeRepository().nodes().lockAndGetRequired("host2")) { - Node host2 = lock.node().withWantToRetire(true, false, true, Agent.system, tester.nodeRepository().clock().instant()); + Node host2 = lock.node().withWantToRetire(true, false, true, false, Agent.system, tester.nodeRepository().clock().instant()); tester.nodeRepository().nodes().write(host2, lock); } tester.nodeRepository().nodes().removeRecursively("host2"); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java index ac20b9164f8..ddd7413567a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java @@ -2,15 +2,10 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.component.Version; -import com.yahoo.config.provision.Cloud; import com.yahoo.config.provision.ClusterMembership; -import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; -import com.yahoo.config.provision.RegionName; -import com.yahoo.config.provision.SystemName; -import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Allocation; @@ -37,8 +32,8 @@ public class DirtyExpirerTest { } private void assertAllocationAfterExpiry(boolean dynamicProvisioning) { - Zone zone = new Zone(Cloud.builder().dynamicProvisioning(dynamicProvisioning).build(), SystemName.main, Environment.prod, RegionName.from("us-east")); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone) + ProvisioningTester tester = new ProvisioningTester.Builder() + .dynamicProvisioning(dynamicProvisioning, true) .hostProvisioner(dynamicProvisioning ? new MockHostProvisioner(List.of()) : null) .build(); @@ -65,4 +60,4 @@ public class DirtyExpirerTest { assertEquals(dynamicProvisioning, tester.nodeRepository().nodes().list().first().get().status().wantToDeprovision()); } -} \ No newline at end of file +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java index d143657310d..cfa76419262 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java @@ -756,7 +756,7 @@ public class HostCapacityMaintainerTest { parentHostname.ifPresent(builder::parentHostname); allocation.ifPresent(builder::allocation); if (hostname.equals("host2-1")) - builder.status(Status.initial().withWantToRetire(true, true, false)); + builder.status(Status.initial().withWantToRetire(true, true, false, false)); return builder.build(); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java new file mode 100644 index 00000000000..964af14d8a9 --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java @@ -0,0 +1,85 @@ +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.Capacity; +import com.yahoo.config.provision.ClusterResources; +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.NodeFlavors; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.config.provision.NodeType; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; +import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; +import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; +import com.yahoo.vespa.hosted.provision.testutils.MockHostProvisioner; +import com.yahoo.vespa.hosted.provision.testutils.MockHostProvisioner.Behaviour; +import org.junit.jupiter.api.Test; + +import java.time.Duration; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * @author mpolden + */ +class HostFlavorUpgraderTest { + + @Test + public void maintain() { + String flavor0 = "host"; + String flavor1 = "host2"; + NodeFlavors flavors = FlavorConfigBuilder.createDummies(flavor0, flavor1); + MockHostProvisioner hostProvisioner = new MockHostProvisioner(flavors.getFlavors()); + ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning() + .flavors(flavors.getFlavors()) + .hostProvisioner(hostProvisioner) + .build(); + ApplicationId app = ProvisioningTester.applicationId(); + NodeResources resources = new NodeResources(4, 8, 100, 1, + NodeResources.DiskSpeed.fast, NodeResources.StorageType.remote); + ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("c1")).vespaVersion("1").build(); + Capacity capacity = Capacity.from(new ClusterResources(2, 1, resources)); + Map applications = Map.of(app, new MockDeployer.ApplicationContext(app, spec, capacity)); + MockDeployer deployer = new MockDeployer(tester.provisioner(), tester.clock(), applications); + HostFlavorUpgrader upgrader = new HostFlavorUpgrader(tester.nodeRepository(), Duration.ofDays(1), + new TestMetric(), deployer, hostProvisioner); + + // Provision hosts and deploy application + tester.makeReadyNodes(2, flavor0, NodeType.host); + tester.activateTenantHosts(); + tester.deploy(app, spec, capacity); + Node host = tester.nodeRepository().nodes().list().hosts().first().get(); + assertEquals(flavor0, host.flavor().name()); + + // Nothing to upgrade initially + upgrader.maintain(); + assertEquals(NodeList.of(), tester.nodeRepository().nodes().list() + .matching(h -> h.status().wantToUpgradeFlavor())); + + // Mark flavor as upgradable, but fail all provisioning requests + hostProvisioner.addUpgradableFlavor(flavor0) + .with(Behaviour.failProvisionRequest); + upgrader.maintain(); + assertEquals(NodeList.of(), + tester.nodeRepository().nodes().list() + .matching(node -> node.status().wantToUpgradeFlavor() || node.status().wantToRetire()), + "No hosts marked for upgrade or retirement"); + + // First provision request fails, but second succeeds and a replacement host starts provisioning + hostProvisioner.with(Behaviour.failProvisionRequest, 1); + assertEquals(-0.5, upgrader.maintain(), Double.MIN_VALUE); // One host failed, one succeeded + NodeList nodes = tester.nodeRepository().nodes().list(); + NodeList upgradingFlavor = nodes.matching(node -> node.status().wantToRetire() && + node.status().wantToUpgradeFlavor()); + assertEquals(1, upgradingFlavor.size()); + assertEquals(1, nodes.state(Node.State.provisioned).size()); + + // No more upgrades are started while host is retiring + assertEquals(1.0, upgrader.maintain(), Double.MIN_VALUE); + assertEquals(upgradingFlavor, tester.nodeRepository().nodes().list() + .matching(node -> node.status().wantToUpgradeFlavor())); + } + +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostRetirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostRetirerTest.java index 387a2cf5a4b..253c150f9da 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostRetirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostRetirerTest.java @@ -1,14 +1,9 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; -import com.yahoo.config.provision.Cloud; -import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.HostEvent; import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; -import com.yahoo.config.provision.RegionName; -import com.yahoo.config.provision.SystemName; -import com.yahoo.config.provision.Zone; import com.yahoo.jdisc.test.MockMetric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -30,15 +25,10 @@ public class HostRetirerTest { @Test public void retire_hosts() { NodeFlavors flavors = FlavorConfigBuilder.createDummies("default"); - Zone zone = new Zone(Cloud.builder() - .dynamicProvisioning(true) - .build(), SystemName.defaultSystem(), - Environment.defaultEnvironment(), - RegionName.defaultName()); MockHostProvisioner hostProvisioner = new MockHostProvisioner(flavors.getFlavors()); ProvisioningTester tester = new ProvisioningTester.Builder().hostProvisioner(hostProvisioner) .flavors(flavors.getFlavors()) - .zone(zone) + .dynamicProvisioning() .build(); HostRetirer retirer = new HostRetirer(tester.nodeRepository(), Duration.ofDays(1), new MockMetric(), hostProvisioner); tester.makeReadyHosts(3, new NodeResources(24, 48, 1000, 10)) diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java index 40b035968bd..ada96b3f793 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java @@ -622,7 +622,7 @@ public class OsVersionsTest { Optional wantedOsVersion = node.status().osVersion().wanted(); assertFalse(node + " is not retiring", node.status().wantToRetire()); assertTrue(node + " is rebuilding", node.status().wantToRebuild()); - node = node.withWantToRetire(false, false, false, Agent.system, + node = node.withWantToRetire(false, false, false, false, Agent.system, tester.clock().instant()); return node.with(node.status().withOsVersion(node.status().osVersion().withCurrent(wantedOsVersion))); }); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java index ccb13ea5ada..1a0827fc487 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java @@ -37,7 +37,6 @@ import org.junit.Test; import java.nio.charset.StandardCharsets; import java.time.Duration; import java.time.Instant; -import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -327,14 +326,15 @@ public class NodeSerializerTest { } @Test - public void want_to_rebuild() { + public void want_to_rebuild_and_upgrade_flavor() { Node node = nodeSerializer.fromJson(nodeSerializer.toJson(createNode())); assertFalse(node.status().wantToRebuild()); - node = node.with(node.status().withWantToRetire(true, false, true)); + node = node.with(node.status().withWantToRetire(true, false, true, true)); node = nodeSerializer.fromJson(nodeSerializer.toJson(node)); assertTrue(node.status().wantToRetire()); assertFalse(node.status().wantToDeprovision()); assertTrue(node.status().wantToRebuild()); + assertTrue(node.status().wantToUpgradeFlavor()); } @Test diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTest.java index d4a911fa1ab..ced29b28d41 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTest.java @@ -3,10 +3,8 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Capacity; -import com.yahoo.config.provision.Cloud; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; -import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.NodeFlavors; @@ -15,9 +13,6 @@ import com.yahoo.config.provision.NodeResources.Architecture; import com.yahoo.config.provision.NodeResources.DiskSpeed; import com.yahoo.config.provision.NodeResources.StorageType; import com.yahoo.config.provision.NodeType; -import com.yahoo.config.provision.RegionName; -import com.yahoo.config.provision.SystemName; -import com.yahoo.config.provision.Zone; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.flags.PermanentFlags; import com.yahoo.vespa.hosted.provision.Node; @@ -250,7 +245,7 @@ public class DynamicProvisioningTest { List flavors = List.of(new Flavor("2x", new NodeResources(2, 17, 200, 10, fast, remote))); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone(false)) + ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning(true, false) .flavors(flavors) .hostProvisioner(new MockHostProvisioner(flavors, memoryTax)) .nameResolver(nameResolver) @@ -295,7 +290,8 @@ public class DynamicProvisioningTest { List flavors = List.of(new Flavor("x86", new NodeResources(2, 4, 50, 0.1, fast, local, Architecture.x86_64)), new Flavor("arm", new NodeResources(2, 4, 50, 0.1, fast, local, Architecture.arm64))); MockHostProvisioner hostProvisioner = new MockHostProvisioner(flavors); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone(false)) + ProvisioningTester tester = new ProvisioningTester.Builder() + .dynamicProvisioning(true, false) .flavors(flavors) .hostProvisioner(hostProvisioner) .resourcesCalculator(0, 0) @@ -338,7 +334,7 @@ public class DynamicProvisioningTest { new Flavor("2x", new NodeResources(2, 20 - memoryTax, 200, 0.1, fast, remote)), new Flavor("4x", new NodeResources(4, 40 - memoryTax, 400, 0.1, fast, remote))); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone(false)) + ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning(true, false) .flavors(flavors) .hostProvisioner(new MockHostProvisioner(flavors, memoryTax)) .nameResolver(nameResolver) @@ -413,7 +409,7 @@ public class DynamicProvisioningTest { new Flavor("4x", new NodeResources(4, 40 - memoryTax, 400, 0.1, fast, remote)), new Flavor("4xl", new NodeResources(4, 40 - memoryTax, 400, 0.1, fast, local))); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone(false)) + ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning(true, false) .flavors(flavors) .hostProvisioner(new MockHostProvisioner(flavors, memoryTax)) .nameResolver(nameResolver) @@ -448,7 +444,7 @@ public class DynamicProvisioningTest { new Flavor("2xl", new NodeResources(2, 20 - memoryTax, 200, 0.1, fast, remote)), new Flavor("4xl", new NodeResources(4, 40 - memoryTax, 400, 0.1, fast, remote))); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone(false)) + ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning(true, false) .flavors(flavors) .hostProvisioner(new MockHostProvisioner(flavors, memoryTax)) .nameResolver(nameResolver) @@ -471,7 +467,7 @@ public class DynamicProvisioningTest { public void gpu_host() { List flavors = List.of(new Flavor("gpu", new NodeResources(4, 16, 125, 10, fast, local, Architecture.x86_64, new NodeResources.GpuResources(1, 16)))); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(zone(false)) + ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning(true, false) .flavors(flavors) .hostProvisioner(new MockHostProvisioner(flavors)) .nameResolver(nameResolver) @@ -484,17 +480,9 @@ public class DynamicProvisioningTest { 2, 1, resources); } - private Zone zone(boolean sharing) { - return new Zone( - Cloud.builder().dynamicProvisioning(true).allowHostSharing(sharing).build(), - SystemName.main, - Environment.prod, - RegionName.from("us-east")); - } - private ProvisioningTester tester(boolean sharing) { var hostProvisioner = new MockHostProvisioner(new NodeFlavors(ProvisioningTester.createConfig()).getFlavors(), nameResolver, 0); - return new ProvisioningTester.Builder().zone(zone(sharing)).hostProvisioner(hostProvisioner).nameResolver(nameResolver).build(); + return new ProvisioningTester.Builder().dynamicProvisioning(true, sharing).hostProvisioner(hostProvisioner).nameResolver(nameResolver).build(); } private void prepareAndActivate(ApplicationId application, ClusterSpec clusterSpec, int nodes, int groups, NodeResources resources, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index 76dcc6cf8a8..2acbeb00f5f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -7,9 +7,11 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationName; import com.yahoo.config.provision.ApplicationTransaction; import com.yahoo.config.provision.Capacity; +import com.yahoo.config.provision.Cloud; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.DockerImage; +import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.HostFilter; import com.yahoo.config.provision.HostSpec; @@ -21,6 +23,8 @@ import com.yahoo.config.provision.NodeResources.StorageType; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.ProvisionLock; import com.yahoo.config.provision.ProvisionLogger; +import com.yahoo.config.provision.RegionName; +import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.TenantName; import com.yahoo.config.provision.Zone; import com.yahoo.config.provisioning.FlavorsConfig; @@ -673,6 +677,20 @@ public class ProvisioningTester { return this; } + public Builder dynamicProvisioning() { + return dynamicProvisioning(true, true); + } + + public Builder dynamicProvisioning(boolean enabled, boolean allowHostSharing) { + return zone(new Zone(Cloud.builder() + .dynamicProvisioning(enabled) + .allowHostSharing(allowHostSharing) + .build(), + SystemName.defaultSystem(), + Environment.defaultEnvironment(), + RegionName.defaultName())); + } + public Builder nameResolver(NameResolver nameResolver) { this.nameResolver = nameResolver; return this; -- cgit v1.2.3 From 6ee53faf3172ed15a541717fd51bebc73cd82e5d Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Tue, 6 Jun 2023 11:34:40 +0200 Subject: Remove log message --- .../yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java index 6f30d7fedf1..eb05aa21c4e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java @@ -82,8 +82,7 @@ public class HostFlavorUpgrader extends NodeRepositoryMaintainer { redeployed = true; return asSuccessFactorDeviation(attempts, failures); } catch (NodeAllocationException e) { - log.log(Level.WARNING, "No capacity for flavor upgrade of " + parent + ". Retrying in " + - interval(), e); + // Fine, no capacity for upgrade } finally { if (deploymentValid && !redeployed) { // Cancel upgrade if redeploy failed failures++; -- cgit v1.2.3 From dc7841a19b775132b34044032d661c7213cd20a9 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Tue, 6 Jun 2023 11:36:55 +0200 Subject: Do not count lack of capacity as failures --- .../vespa/hosted/provision/maintenance/HostFlavorUpgrader.java | 6 +----- .../hosted/provision/maintenance/HostFlavorUpgraderTest.java | 8 ++++---- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java index eb05aa21c4e..b16f2c5c17e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java @@ -60,8 +60,6 @@ public class HostFlavorUpgrader extends NodeRepositoryMaintainer { NodeList activeNodes = allNodes.nodeType(NodeType.tenant) .state(Node.State.active) .shuffle(random); // Shuffle to avoid getting stuck trying to upgrade the same host - int attempts = 0; - int failures = 0; for (var node : activeNodes) { Optional parent = allNodes.parentOf(node); if (parent.isEmpty()) continue; @@ -74,18 +72,16 @@ public class HostFlavorUpgrader extends NodeRepositoryMaintainer { deploymentValid = deployment.isValid(); if (!deploymentValid) continue; - attempts++; log.log(Level.INFO, () -> "Redeploying " + node.allocation().get().owner() + " to upgrade flavor (" + parent.get().flavor().name() + ") of " + parent.get()); upgradeFlavor(parent.get(), true); deployment.activate(); redeployed = true; - return asSuccessFactorDeviation(attempts, failures); + return 1.0; } catch (NodeAllocationException e) { // Fine, no capacity for upgrade } finally { if (deploymentValid && !redeployed) { // Cancel upgrade if redeploy failed - failures++; upgradeFlavor(parent.get(), false); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java index 964af14d8a9..6224143aabf 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgraderTest.java @@ -54,14 +54,14 @@ class HostFlavorUpgraderTest { assertEquals(flavor0, host.flavor().name()); // Nothing to upgrade initially - upgrader.maintain(); + assertEquals(1, upgrader.maintain()); assertEquals(NodeList.of(), tester.nodeRepository().nodes().list() .matching(h -> h.status().wantToUpgradeFlavor())); // Mark flavor as upgradable, but fail all provisioning requests hostProvisioner.addUpgradableFlavor(flavor0) .with(Behaviour.failProvisionRequest); - upgrader.maintain(); + assertEquals(1, upgrader.maintain()); assertEquals(NodeList.of(), tester.nodeRepository().nodes().list() .matching(node -> node.status().wantToUpgradeFlavor() || node.status().wantToRetire()), @@ -69,7 +69,7 @@ class HostFlavorUpgraderTest { // First provision request fails, but second succeeds and a replacement host starts provisioning hostProvisioner.with(Behaviour.failProvisionRequest, 1); - assertEquals(-0.5, upgrader.maintain(), Double.MIN_VALUE); // One host failed, one succeeded + assertEquals(1, upgrader.maintain()); NodeList nodes = tester.nodeRepository().nodes().list(); NodeList upgradingFlavor = nodes.matching(node -> node.status().wantToRetire() && node.status().wantToUpgradeFlavor()); @@ -77,7 +77,7 @@ class HostFlavorUpgraderTest { assertEquals(1, nodes.state(Node.State.provisioned).size()); // No more upgrades are started while host is retiring - assertEquals(1.0, upgrader.maintain(), Double.MIN_VALUE); + assertEquals(1, upgrader.maintain()); assertEquals(upgradingFlavor, tester.nodeRepository().nodes().list() .matching(node -> node.status().wantToUpgradeFlavor())); } -- cgit v1.2.3