summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2022-10-04 09:14:19 +0200
committerGitHub <noreply@github.com>2022-10-04 09:14:19 +0200
commitf07ab2ab209ca9983046ca488ddf86a765774eea (patch)
treeeb4e8164d035eeae90ca10f75412c68ef5939f7b
parent906b57fc1b720bd31faf25f3e355cd1e55ab7be6 (diff)
parent5eaf5a34f3c03496d053f2fabee5ca6cddb23067 (diff)
Merge pull request #24286 from vespa-engine/mpolden/soft-rebuild
Support soft-rebuilding hosts with remote disk
-rw-r--r--config-provisioning/src/main/java/com/yahoo/config/provision/Cloud.java1
-rw-r--r--flags/src/main/java/com/yahoo/vespa/flags/Flags.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java14
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java15
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java57
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java28
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java27
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java26
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java10
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java6
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java9
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java20
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java24
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java94
16 files changed, 289 insertions, 57 deletions
diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/Cloud.java b/config-provisioning/src/main/java/com/yahoo/config/provision/Cloud.java
index a73c826c6b9..35e3a2bf5e2 100644
--- a/config-provisioning/src/main/java/com/yahoo/config/provision/Cloud.java
+++ b/config-provisioning/src/main/java/com/yahoo/config/provision/Cloud.java
@@ -34,6 +34,7 @@ public class Cloud {
}
/** Returns whether upgrading OS on hosts in this requires the host to be reprovisioned */
+ // TODO(mpolden): Unused, remove this
public boolean reprovisionToUpgradeOs() {
return reprovisionToUpgradeOs;
}
diff --git a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
index fb88ce7886a..bf9033fa7c6 100644
--- a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
+++ b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
@@ -460,6 +460,13 @@ public class Flags {
"Set mode for CSRF filter ('disabled', 'log_only', 'enabled')",
"Takes effect on controller restart/redeployment");
+ public static final UnboundBooleanFlag SOFT_REBUILD = defineFeatureFlag(
+ "soft-rebuild", false,
+ List.of("mpolden"), "2022-09-27", "2022-12-01",
+ "Whether soft rebuild can be used to rebuild hosts with remote disk",
+ "Takes effect on next run of OsUpgradeActivator"
+ );
+
public static final UnboundListFlag<String> CSRF_USERS = defineListFlag(
"csrf-users", List.of(), String.class,
List.of("bjorncs", "tokle"), "2022-09-22", "2023-06-01",
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index cde7f300f2b..9cba823500b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -258,6 +258,10 @@ public final class Node implements Nodelike {
if (wantToRetire == status.wantToRetire() &&
wantToDeprovision == status.wantToDeprovision() &&
wantToRebuild == status.wantToRebuild()) return this;
+ if (wantToRebuild && !wantToRetire && resources().storageType() != NodeResources.StorageType.remote) {
+ throw new IllegalArgumentException("Cannot rebuild " + this + " without retiring because storage is " +
+ resources().storageType());
+ }
Node node = this.with(status.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild));
if (wantToRetire)
node = node.with(history.with(new History.Event(History.Event.Type.wantToRetire, agent, at)));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index dd4d5aa213f..58535b54a1b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -50,8 +50,13 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
}
/** Returns the subset of nodes that are being rebuilt */
- public NodeList rebuilding() {
- return matching(node -> node.status().wantToRetire() && node.status().wantToRebuild());
+ public NodeList rebuilding(boolean soft) {
+ return matching(node -> {
+ if (soft) {
+ return !node.status().wantToRetire() && node.status().wantToRebuild();
+ }
+ return node.status().wantToRetire() && node.status().wantToRebuild();
+ });
}
/** Returns the subset of nodes which are removable */
@@ -67,6 +72,11 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
/** Returns the subset of nodes having exactly the given resources */
public NodeList resources(NodeResources resources) { return matching(node -> node.resources().equals(resources)); }
+ /** Returns the subset of nodes having storage of given type */
+ public NodeList storageType(NodeResources.StorageType storageType) {
+ return matching(node -> node.resources().storageType() == storageType);
+ }
+
/** Returns the subset of nodes which satisfy the given resources */
public NodeList satisfies(NodeResources resources) { return matching(node -> node.resources().satisfies(resources)); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
index 5ffadd806d5..5f43d80b87a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
@@ -79,6 +79,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
NodeList nodes = nodeRepository().nodes().list();
resumeProvisioning(nodes);
convergeToCapacity(nodes);
+ replaceRootDisk(nodes);
return 1.0;
}
@@ -151,6 +152,20 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
});
}
+ /** Replace the root disk of hosts that have requested soft-rebuild */
+ private void replaceRootDisk(NodeList nodes) {
+ NodeList softRebuildingHosts = nodes.rebuilding(true);
+ for (var host : softRebuildingHosts) {
+ Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Optional.of(Duration.ofSeconds(10)));
+ try (NodeMutex mutex = optionalMutex.get()) {
+ Node updatedNode = hostProvisioner.replaceRootDisk(host);
+ if (!updatedNode.status().wantToRebuild()) {
+ nodeRepository().nodes().write(updatedNode, mutex);
+ }
+ }
+ }
+ }
+
/**
* Provision hosts to ensure there is room to allocate spare nodes.
*
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index d641f59eafb..ec3e2539170 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -638,30 +638,35 @@ public class Nodes {
/** Retire and deprovision given host and all of its children */
public List<Node> deprovision(String hostname, Agent agent, Instant instant) {
- return decommission(hostname, DecommissionOperation.deprovision, agent, instant);
+ return decommission(hostname, HostOperation.deprovision, agent, instant);
}
- /** Retire and rebuild given host and all of its children */
- public List<Node> rebuild(String hostname, Agent agent, Instant instant) {
- return decommission(hostname, DecommissionOperation.rebuild, agent, instant);
+ /** Rebuild given host */
+ public List<Node> rebuild(String hostname, boolean soft, Agent agent, Instant instant) {
+ return decommission(hostname, soft ? HostOperation.softRebuild : HostOperation.rebuild, agent, instant);
}
- private List<Node> decommission(String hostname, DecommissionOperation op, Agent agent, Instant instant) {
+ private List<Node> decommission(String hostname, HostOperation op, Agent agent, Instant instant) {
Optional<NodeMutex> nodeMutex = lockAndGet(hostname);
if (nodeMutex.isEmpty()) return List.of();
Node host = nodeMutex.get().node();
if (!host.type().isHost()) throw new IllegalArgumentException("Cannot " + op + " non-host " + host);
- List<Node> result;
- boolean wantToDeprovision = op == DecommissionOperation.deprovision;
- boolean wantToRebuild = op == DecommissionOperation.rebuild;
+
+ boolean wantToDeprovision = op == HostOperation.deprovision;
+ boolean wantToRebuild = op == HostOperation.rebuild || op == HostOperation.softRebuild;
+ boolean wantToRetire = op.needsRetirement();
+ List<Node> result = new ArrayList<>();
try (NodeMutex lock = nodeMutex.get(); Mutex allocationLock = lockUnallocated()) {
// This takes allocationLock to prevent any further allocation of nodes on this host
host = lock.node();
- result = performOn(list(allocationLock).childrenOf(host), (node, nodeLock) -> {
- Node newNode = node.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant);
- return write(newNode, nodeLock);
- });
- Node newHost = host.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant);
+ if (wantToRetire) { // Apply recursively if we're retiring
+ List<Node> updatedNodes = performOn(list(allocationLock).childrenOf(host), (node, nodeLock) -> {
+ Node newNode = node.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, agent, instant);
+ return write(newNode, nodeLock);
+ });
+ result.addAll(updatedNodes);
+ }
+ Node newHost = host.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, agent, instant);
result.add(write(newHost, lock));
}
return result;
@@ -863,10 +868,28 @@ public class Nodes {
retirementRequestedByOperator;
}
- /** The different ways a host can be decommissioned */
- private enum DecommissionOperation {
- deprovision,
- rebuild,
+ private enum HostOperation {
+
+ /** Host is deprovisioned and data is destroyed */
+ deprovision(true),
+
+ /** Host is deprovisioned, the same host is later re-provisioned and data is destroyed */
+ rebuild(true),
+
+ /** Host is stopped and re-bootstrapped, data is preserved */
+ softRebuild(false);
+
+ private final boolean needsRetirement;
+
+ HostOperation(boolean needsRetirement) {
+ this.needsRetirement = needsRetirement;
+ }
+
+ /** Returns whether this operation requires the host and its children to be retired */
+ public boolean needsRetirement() {
+ return needsRetirement;
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java
index cc3f610cc44..ef0f899ca3e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java
@@ -45,8 +45,8 @@ public class Status {
if (wantToDeprovision && wantToRebuild) {
throw new IllegalArgumentException("Node cannot be marked both wantToDeprovision and wantToRebuild");
}
- if ((wantToDeprovision || wantToRebuild) && !wantToRetire) {
- throw new IllegalArgumentException("Node cannot be marked wantToDeprovision or wantToRebuild unless it's also marked wantToRetire");
+ if (wantToDeprovision && !wantToRetire) {
+ throw new IllegalArgumentException("Node cannot be marked wantToDeprovision unless it's also marked wantToRetire");
}
this.wantToRetire = wantToRetire;
this.wantToDeprovision = wantToDeprovision;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java
new file mode 100644
index 00000000000..7aaf37a8ee6
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java
@@ -0,0 +1,28 @@
+package com.yahoo.vespa.hosted.provision.os;
+
+import com.yahoo.config.provision.NodeType;
+
+import java.util.List;
+
+/**
+ * An implementation of {@link OsUpgrader} that delegates calls to multiple implementations.
+ *
+ * @author mpolden
+ */
+public record CompositeOsUpgrader(List<OsUpgrader> upgraders) implements OsUpgrader {
+
+ public CompositeOsUpgrader(List<OsUpgrader> upgraders) {
+ this.upgraders = List.copyOf(upgraders);
+ }
+
+ @Override
+ public void upgradeTo(OsVersionTarget target) {
+ upgraders.forEach(upgrader -> upgrader.upgradeTo(target));
+ }
+
+ @Override
+ public void disableUpgrade(NodeType type) {
+ upgraders.forEach(upgrader -> upgrader.disableUpgrade(type));
+ }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java
index 440046ab818..89fdf9d4b2a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java
@@ -4,12 +4,15 @@ package com.yahoo.vespa.hosted.provision.os;
import com.yahoo.component.Version;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.curator.Lock;
+import com.yahoo.vespa.flags.BooleanFlag;
+import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Status;
import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
import java.time.Duration;
+import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.function.UnaryOperator;
@@ -35,18 +38,20 @@ public class OsVersions {
private final NodeRepository nodeRepository;
private final CuratorDatabaseClient db;
- private final boolean reprovisionToUpgradeOs;
+ private final boolean dynamicProvisioning;
private final int maxDelegatedUpgrades;
+ private final BooleanFlag softRebuildFlag;
public OsVersions(NodeRepository nodeRepository) {
- this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), MAX_DELEGATED_UPGRADES);
+ this(nodeRepository, nodeRepository.zone().getCloud().dynamicProvisioning(), MAX_DELEGATED_UPGRADES);
}
- OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades) {
+ OsVersions(NodeRepository nodeRepository, boolean dynamicProvisioning, int maxDelegatedUpgrades) {
this.nodeRepository = Objects.requireNonNull(nodeRepository);
this.db = nodeRepository.database();
- this.reprovisionToUpgradeOs = reprovisionToUpgradeOs;
+ this.dynamicProvisioning = dynamicProvisioning;
this.maxDelegatedUpgrades = maxDelegatedUpgrades;
+ this.softRebuildFlag = Flags.SOFT_REBUILD.bindTo(nodeRepository.flagSource());
// Read and write all versions to make sure they are stored in the latest version of the serialized format
try (var lock = db.lockOsVersionChange()) {
@@ -136,8 +141,16 @@ public class OsVersions {
/** Returns the upgrader to use when upgrading given node type to target */
private OsUpgrader chooseUpgrader(NodeType nodeType, Optional<Version> target) {
- if (reprovisionToUpgradeOs) {
- return new RetiringOsUpgrader(nodeRepository);
+ if (dynamicProvisioning) {
+ boolean softRebuild = softRebuildFlag.value();
+ RetiringOsUpgrader retiringOsUpgrader = new RetiringOsUpgrader(nodeRepository, softRebuild);
+ if (softRebuild) {
+ // If soft rebuild is enabled, we can use RebuildingOsUpgrader for hosts with remote storage.
+ // RetiringOsUpgrader is then only used for hosts with local storage.
+ return new CompositeOsUpgrader(List.of(new RebuildingOsUpgrader(nodeRepository, softRebuild),
+ retiringOsUpgrader));
+ }
+ return retiringOsUpgrader;
}
// Require rebuild if we have any nodes of this type on a major version lower than target
boolean rebuildRequired = target.isPresent() &&
@@ -147,7 +160,7 @@ public class OsVersions {
.anyMatch(osVersion -> osVersion.current().isPresent() &&
osVersion.current().get().getMajor() < target.get().getMajor());
if (rebuildRequired) {
- return new RebuildingOsUpgrader(nodeRepository);
+ return new RebuildingOsUpgrader(nodeRepository, false);
}
return new DelegatingOsUpgrader(nodeRepository, maxDelegatedUpgrades);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
index f96effe9e10..6b61c864a0c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.hosted.provision.os;
import com.yahoo.component.Version;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.flags.IntFlag;
import com.yahoo.vespa.flags.PermanentFlags;
@@ -22,10 +23,10 @@ import java.util.Set;
import java.util.logging.Logger;
/**
- * An upgrader that retires and rebuilds hosts on stale OS versions.
+ * An upgrader that rebuilds hosts on stale OS versions.
*
- * - We limit the number of concurrent rebuilds to reduce impact of retiring too many hosts.
- * - We limit rebuilds by cluster so that at most one node per stateful cluster per application is retired at a time.
+ * - We limit the number of concurrent rebuilds to reduce impact of suspending or retiring too many hosts.
+ * - We limit rebuilds by cluster so that at most one node per stateful cluster per application is rebuilt at a time.
*
* Used in cases where performing an OS upgrade requires rebuilding the host, e.g. when upgrading across major versions.
*
@@ -37,10 +38,12 @@ public class RebuildingOsUpgrader implements OsUpgrader {
private final NodeRepository nodeRepository;
private final IntFlag maxRebuilds;
+ private final boolean softRebuild;
- public RebuildingOsUpgrader(NodeRepository nodeRepository) {
+ public RebuildingOsUpgrader(NodeRepository nodeRepository, boolean softRebuild) {
this.nodeRepository = nodeRepository;
this.maxRebuilds = PermanentFlags.MAX_REBUILDS.bindTo(nodeRepository.flagSource());
+ this.softRebuild = softRebuild;
}
@Override
@@ -59,22 +62,27 @@ public class RebuildingOsUpgrader implements OsUpgrader {
private int rebuildLimit(NodeType hostType, NodeList hostsOfType) {
if (hostsOfType.stream().anyMatch(host -> host.type() != hostType)) illegal("All hosts must be a " + hostType);
int limit = hostType == NodeType.host ? maxRebuilds.value() : 1;
- return Math.max(0, limit - hostsOfType.rebuilding().size());
+ return Math.max(0, limit - hostsOfType.rebuilding(softRebuild).size());
}
private List<Node> rebuildableHosts(OsVersionTarget target, NodeList allNodes, Instant now) {
NodeList hostsOfTargetType = allNodes.nodeType(target.nodeType());
+ if (softRebuild) {
+ // Soft rebuild is enabled so this should only act on hosts with remote storage
+ hostsOfTargetType = hostsOfTargetType.storageType(NodeResources.StorageType.remote);
+ }
int rebuildLimit = rebuildLimit(target.nodeType(), hostsOfTargetType);
// Find stateful clusters with retiring nodes
NodeList activeNodes = allNodes.state(Node.State.active);
Set<ClusterId> retiringClusters = new HashSet<>(activeNodes.nodeType(target.nodeType().childNodeType())
- .retiring().statefulClusters());
+ .retiring()
+ .statefulClusters());
// Rebuild hosts not containing stateful clusters with retiring nodes, up to rebuild limit
List<Node> hostsToRebuild = new ArrayList<>(rebuildLimit);
NodeList candidates = hostsOfTargetType.state(Node.State.active)
- .not().rebuilding()
+ .not().rebuilding(softRebuild)
.osVersionIsBefore(target.version())
.matching(node -> canUpgradeAt(now, node))
.byIncreasingOsVersion();
@@ -91,10 +99,10 @@ public class RebuildingOsUpgrader implements OsUpgrader {
}
private void rebuild(Node host, Version target, Instant now) {
- LOG.info("Retiring and rebuilding " + host + ": On stale OS version " +
+ LOG.info((softRebuild ? "Soft-rebuilding " : "Retiring and rebuilding ") + host + ": On stale OS version " +
host.status().osVersion().current().map(Version::toFullString).orElse("<unset>") +
", want " + target);
- nodeRepository.nodes().rebuild(host.hostname(), Agent.RebuildingOsUpgrader, now);
+ nodeRepository.nodes().rebuild(host.hostname(), softRebuild, Agent.RebuildingOsUpgrader, now);
nodeRepository.nodes().upgradeOs(NodeListFilter.from(host), Optional.of(target));
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
index 43843f6fe5a..860a17be28c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.hosted.provision.os;
import com.yahoo.component.Version;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
@@ -28,8 +29,11 @@ public class RetiringOsUpgrader implements OsUpgrader {
protected final NodeRepository nodeRepository;
- public RetiringOsUpgrader(NodeRepository nodeRepository) {
+ private final boolean softRebuild;
+
+ public RetiringOsUpgrader(NodeRepository nodeRepository, boolean softRebuild) {
this.nodeRepository = nodeRepository;
+ this.softRebuild = softRebuild;
}
@Override
@@ -57,6 +61,10 @@ public class RetiringOsUpgrader implements OsUpgrader {
/** Returns nodes that are candidates for upgrade */
private NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) {
NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType());
+ if (softRebuild) {
+ // Soft rebuild is enabled, so this should only act on hosts with local storage
+ activeNodes = activeNodes.storageType(NodeResources.StorageType.local);
+ }
if (activeNodes.isEmpty()) return NodeList.of();
Duration nodeBudget = target.upgradeBudget().dividedBy(activeNodes.size());
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
index 567fa9098c9..9b765adca89 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
@@ -79,6 +79,12 @@ public interface HostProvisioner {
*/
void deprovision(Node host);
+ /** Replace the root (OS) disk of host. Implementations of this are expected to be idempotent.
+ *
+ * @return the updated node object
+ */
+ Node replaceRootDisk(Node host);
+
/**
* Returns the maintenance events scheduled for hosts in this zone, in given cloud accounts. Host events in the
* zone's default cloud account are always included.
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
index 309280c8f15..c82cd8fb47f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
@@ -66,11 +66,10 @@ class NodesResponse extends SlimeJsonResponse {
Cursor root = slime.setObject();
switch (responseType) {
- case nodeList: nodesToSlime(filter.states(), root); break;
- case stateList : statesToSlime(root); break;
- case nodesInStateList: nodesToSlime(Set.of(NodeSerializer.stateFrom(lastElement(parentUrl))), root); break;
- case singleNode : nodeToSlime(lastElement(parentUrl), root); break;
- default: throw new IllegalArgumentException();
+ case nodeList -> nodesToSlime(filter.states(), root);
+ case stateList -> statesToSlime(root);
+ case nodesInStateList -> nodesToSlime(Set.of(NodeSerializer.stateFrom(lastElement(parentUrl))), root);
+ case singleNode -> nodeToSlime(lastElement(parentUrl), root);
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
index 8d60dd30dd1..13753c12664 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
@@ -7,16 +7,18 @@ import com.yahoo.config.provision.CloudAccount;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.HostEvent;
+import com.yahoo.config.provision.NodeAllocationException;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
-import com.yahoo.config.provision.NodeAllocationException;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.node.Address;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.IP;
import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
@@ -37,6 +39,7 @@ public class MockHostProvisioner implements HostProvisioner {
private final List<Flavor> flavors;
private final MockNameResolver nameResolver;
private final int memoryTaxGb;
+ private final Set<String> rebuildsCompleted = new HashSet<>();
private int deprovisionedHosts = 0;
private EnumSet<Behaviour> behaviours = EnumSet.noneOf(Behaviour.class);
@@ -103,6 +106,16 @@ public class MockHostProvisioner implements HostProvisioner {
}
@Override
+ public Node replaceRootDisk(Node host) {
+ if (!host.type().isHost()) throw new IllegalArgumentException(host + " is not a host");
+ if (rebuildsCompleted.remove(host.hostname())) {
+ return host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(),
+ false, Agent.system, Instant.ofEpochMilli(123));
+ }
+ return host;
+ }
+
+ @Override
public List<HostEvent> hostEventsIn(List<CloudAccount> cloudAccounts) {
return Collections.unmodifiableList(hostEvents);
}
@@ -129,6 +142,11 @@ public class MockHostProvisioner implements HostProvisioner {
return this;
}
+ public MockHostProvisioner completeRebuildOf(Node host) {
+ rebuildsCompleted.add(host.hostname());
+ return this;
+ }
+
public MockHostProvisioner overrideHostFlavor(String flavorName) {
Flavor flavor = flavors.stream().filter(f -> f.name().equals(flavorName))
.findFirst()
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
index e5e361da379..72b49a4794a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
@@ -603,6 +603,30 @@ public class DynamicProvisioningMaintainerTest {
}
}
+ @Test
+ public void rebuild_host() {
+ var tester = new DynamicProvisioningTester();
+ Node host1 = tester.addNode("host1", Optional.empty(), NodeType.host, Node.State.active);
+ Node host11 = tester.addNode("host1-1", Optional.of("host1"), NodeType.tenant, Node.State.parked, DynamicProvisioningTester.tenantApp);
+ Node host2 = tester.addNode("host2", Optional.empty(), NodeType.host, Node.State.active);
+ Node host21 = tester.addNode("host2-1", Optional.of("host2"), NodeType.tenant, Node.State.parked, DynamicProvisioningTester.tenantApp);
+
+ // No rebuilds in initial run
+ tester.maintainer.maintain();
+ assertEquals(0, tester.nodeRepository.nodes().list().rebuilding(true).size());
+
+ // Host starts rebuilding
+ tester.nodeRepository.nodes().rebuild(host1.hostname(), true, Agent.RebuildingOsUpgrader,
+ tester.nodeRepository.clock().instant());
+ tester.maintainer.maintain();
+ assertEquals(1, tester.nodeRepository.nodes().list().rebuilding(true).size());
+
+ // Rebuild completes
+ tester.hostProvisioner.completeRebuildOf(host1);
+ tester.maintainer.maintain();
+ assertEquals(0, tester.nodeRepository.nodes().list().rebuilding(true).size());
+ }
+
private void assertCfghost3IsActive(DynamicProvisioningTester tester) {
assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).size());
assertEquals(3, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.confighost).size());
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
index 3d7db9a1f96..4d75b8a5acc 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.HostSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.test.ManualClock;
+import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
@@ -273,35 +274,35 @@ public class OsVersionsTest {
versions.resumeUpgradeOf(NodeType.host, true);
// One host starts rebuilding
- assertEquals(1, hostNodes.get().rebuilding().size());
+ assertEquals(1, hostNodes.get().rebuilding(false).size());
// We cannot rebuild another host until the current one is done
versions.resumeUpgradeOf(NodeType.host, true);
- NodeList hostsRebuilding = hostNodes.get().rebuilding();
+ NodeList hostsRebuilding = hostNodes.get().rebuilding(false);
assertEquals(1, hostsRebuilding.size());
completeRebuildOf(hostsRebuilding.asList(), NodeType.host);
assertEquals(1, hostNodes.get().onOsVersion(version1).size());
// Second host is rebuilt
versions.resumeUpgradeOf(NodeType.host, true);
- completeRebuildOf(hostNodes.get().rebuilding().asList(), NodeType.host);
+ completeRebuildOf(hostNodes.get().rebuilding(false).asList(), NodeType.host);
assertEquals(2, hostNodes.get().onOsVersion(version1).size());
// The remaining hosts complete their upgrade
for (int i = 0; i < hostCount - 2; i++) {
versions.resumeUpgradeOf(NodeType.host, true);
- hostsRebuilding = hostNodes.get().rebuilding();
+ hostsRebuilding = hostNodes.get().rebuilding(false);
assertEquals(1, hostsRebuilding.size());
completeRebuildOf(hostsRebuilding.asList(), NodeType.host);
}
// All hosts upgraded and none are rebuilding
- assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().rebuilding().size());
+ assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().rebuilding(false).size());
assertEquals(hostCount, tester.nodeRepository().nodes().list(Node.State.active).size());
// Resuming after everything has upgraded has no effect
versions.resumeUpgradeOf(NodeType.host, true);
- assertEquals(0, hostNodes.get().rebuilding().size());
+ assertEquals(0, hostNodes.get().rebuilding(false).size());
// Next version is within same major. Upgrade mechanism switches to delegated
var version2 = Version.fromString("8.1");
@@ -319,12 +320,62 @@ public class OsVersionsTest {
// Resuming upgrades reactivated host. Upgrade mechanism switches to rebuilding
versions.resumeUpgradeOf(NodeType.host, true);
- hostsRebuilding = hostNodes.get().rebuilding();
+ hostsRebuilding = hostNodes.get().rebuilding(false);
assertEquals(List.of(reactivatedHost), hostsRebuilding.asList());
completeRebuildOf(hostsRebuilding.asList(), NodeType.host);
}
@Test
+    public void upgrade_by_soft_rebuilding() {
+        int maxRebuilds = 3;
+        int hostCount = 12;
+        boolean softRebuild = true;
+
+        tester.flagSource().withIntFlag(PermanentFlags.MAX_REBUILDS.id(), maxRebuilds);
+        tester.flagSource().withBooleanFlag(Flags.SOFT_REBUILD.id(), softRebuild);
+        var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE);
+
+        provisionInfraApplication(hostCount, infraApplication, NodeType.host, NodeResources.StorageType.remote);
+        Supplier<NodeList> hostNodes = () -> tester.nodeRepository().nodes().list().nodeType(NodeType.host);
+
+        // New target is set
+        int hostsRebuilt = 0;
+        var version1 = Version.fromString("8.0");
+        versions.setTarget(NodeType.host, version1, Duration.ZERO, false);
+        versions.resumeUpgradeOf(NodeType.host, true);
+
+        // First batch of hosts start rebuilding
+        assertEquals(maxRebuilds, hostNodes.get().rebuilding(softRebuild).size());
+
+        // We cannot rebuild another host yet
+        versions.resumeUpgradeOf(NodeType.host, true);
+        NodeList hostsRebuilding = hostNodes.get().rebuilding(softRebuild);
+        assertEquals(maxRebuilds, hostsRebuilding.size());
+        completeSoftRebuildOf(hostsRebuilding.asList());
+        assertEquals(hostsRebuilt += maxRebuilds, hostNodes.get().onOsVersion(version1).size());
+
+        // Another batch is rebuilt
+        versions.resumeUpgradeOf(NodeType.host, true);
+        completeSoftRebuildOf(hostNodes.get().rebuilding(softRebuild).asList());
+        assertEquals(hostsRebuilt += maxRebuilds, hostNodes.get().onOsVersion(version1).size());
+
+        // The remaining batches complete their upgrade
+        for (int i = 0; i < (hostCount - hostsRebuilt) / maxRebuilds; i++) {
+            versions.resumeUpgradeOf(NodeType.host, true);
+            hostsRebuilding = hostNodes.get().rebuilding(softRebuild);
+            assertEquals(maxRebuilds, hostsRebuilding.size());
+            completeSoftRebuildOf(hostsRebuilding.asList());
+        }
+
+        // All hosts upgraded and none are rebuilding
+        assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().rebuilding(softRebuild).size());
+
+        // Resuming after everything has upgraded has no effect
+        versions.resumeUpgradeOf(NodeType.host, true);
+        assertEquals(0, hostNodes.get().rebuilding(softRebuild).size());
+    }
+
+ @Test
public void upgrade_by_rebuilding_multiple_host_types() {
tester.flagSource().withIntFlag(PermanentFlags.MAX_REBUILDS.id(), 1);
var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE);
@@ -349,7 +400,7 @@ public class OsVersionsTest {
for (int i = 0; i < hostCount; i++) {
versions.resumeUpgradeOf(NodeType.host, true);
versions.resumeUpgradeOf(NodeType.confighost, true);
- NodeList hostsRebuilding = hosts.get().rebuilding();
+ NodeList hostsRebuilding = hosts.get().rebuilding(false);
assertEquals(2, hostsRebuilding.size());
completeRebuildOf(hostsRebuilding.nodeType(NodeType.host).asList(), NodeType.host);
completeRebuildOf(hostsRebuilding.nodeType(NodeType.confighost).asList(), NodeType.confighost);
@@ -382,7 +433,7 @@ public class OsVersionsTest {
versions.resumeUpgradeOf(NodeType.host, true);
NodeList allNodes = tester.nodeRepository().nodes().list();
List<Node> hostsRebuilding = allNodes.nodeType(NodeType.host)
- .rebuilding()
+ .rebuilding(false)
.sortedBy(Comparator.comparing(Node::hostname))
.asList();
List<Optional<ApplicationId>> owners = List.of(Optional.of(app1), Optional.of(app2), Optional.empty());
@@ -420,7 +471,7 @@ public class OsVersionsTest {
// Since both applications now occupy all remaining hosts, we can only upgrade 1 at a time
for (int i = 0; i < hostsOnOldVersion.size(); i++) {
versions.resumeUpgradeOf(NodeType.host, true);
- hostsRebuilding = hosts.get().rebuilding().asList();
+ hostsRebuilding = hosts.get().rebuilding(false).asList();
assertEquals(1, hostsRebuilding.size());
replaceNodes(app1);
replaceNodes(app2);
@@ -430,7 +481,7 @@ public class OsVersionsTest {
// Resuming upgrade has no effect as all hosts have upgraded
versions.resumeUpgradeOf(NodeType.host, true);
NodeList allHosts = hosts.get();
- assertEquals(0, allHosts.rebuilding().size());
+ assertEquals(0, allHosts.rebuilding(false).size());
assertEquals(allHosts.size(), allHosts.onOsVersion(version1).size());
}
@@ -454,7 +505,7 @@ public class OsVersionsTest {
// Upgrades 1 infrastructure host at a time
for (int i = 0; i < hostCount; i++) {
versions.resumeUpgradeOf(NodeType.proxyhost, true);
- List<Node> hostsRebuilding = hosts.get().rebuilding().asList();
+ List<Node> hostsRebuilding = hosts.get().rebuilding(false).asList();
assertEquals(1, hostsRebuilding.size());
completeRebuildOf(hostsRebuilding, NodeType.proxyhost);
}
@@ -490,7 +541,13 @@ public class OsVersionsTest {
}
private List<Node> provisionInfraApplication(int nodeCount, ApplicationId application, NodeType nodeType) {
- var nodes = tester.makeReadyNodes(nodeCount, new NodeResources(48, 128, 2000, 10), nodeType, 10);
+ return provisionInfraApplication(nodeCount, application, nodeType, NodeResources.StorageType.local);
+ }
+
+ private List<Node> provisionInfraApplication(int nodeCount, ApplicationId application, NodeType nodeType, NodeResources.StorageType storageType) {
+ var nodes = tester.makeReadyNodes(nodeCount, new NodeResources(48, 128, 2000, 10,
+ NodeResources.DiskSpeed.fast, storageType),
+ nodeType, 10);
tester.prepareAndActivateInfraApplication(application, nodeType);
return nodes.stream()
.map(Node::hostname)
@@ -557,4 +614,15 @@ public class OsVersionsTest {
});
}
+    private void completeSoftRebuildOf(List<Node> nodes) { // simulate host-admin finishing a soft rebuild: clear the rebuild flag and set current OS version to the wanted one
+        tester.patchNodes(nodes, (node) -> {
+            Optional<Version> wantedOsVersion = node.status().osVersion().wanted();
+            assertFalse(node + " is retiring", node.status().wantToRetire()); // a soft rebuild must not retire the host
+            assertTrue(node + " is not rebuilding", node.status().wantToRebuild());
+            node = node.withWantToRetire(false, false, false, Agent.system,
+                    tester.clock().instant());
+            return node.with(node.status().withOsVersion(node.status().osVersion().withCurrent(wantedOsVersion)));
+        });
+    }
+
}