diff options
Diffstat (limited to 'node-repository/src')
13 files changed, 277 insertions, 52 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index cde7f300f2b..9cba823500b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -258,6 +258,10 @@ public final class Node implements Nodelike { if (wantToRetire == status.wantToRetire() && wantToDeprovision == status.wantToDeprovision() && wantToRebuild == status.wantToRebuild()) return this; + if (wantToRebuild && !wantToRetire && resources().storageType() != NodeResources.StorageType.remote) { + throw new IllegalArgumentException("Cannot rebuild " + this + " without retiring because storage is " + + resources().storageType()); + } Node node = this.with(status.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild)); if (wantToRetire) node = node.with(history.with(new History.Event(History.Event.Type.wantToRetire, agent, at))); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index dd4d5aa213f..58535b54a1b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -50,8 +50,13 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { } /** Returns the subset of nodes that are being rebuilt */ - public NodeList rebuilding() { - return matching(node -> node.status().wantToRetire() && node.status().wantToRebuild()); + public NodeList rebuilding(boolean soft) { + return matching(node -> { + if (soft) { + return !node.status().wantToRetire() && node.status().wantToRebuild(); + } + return node.status().wantToRetire() && node.status().wantToRebuild(); + }); } /** Returns the subset of nodes which are removable */ @@ -67,6 +72,11 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { /** Returns the subset of nodes having exactly the given resources */ public NodeList resources(NodeResources resources) { return matching(node -> node.resources().equals(resources)); } + /** Returns the subset of nodes having storage of given type */ + public NodeList storageType(NodeResources.StorageType storageType) { + return matching(node -> node.resources().storageType() == storageType); + } + /** Returns the subset of nodes which satisfy the given resources */ public NodeList satisfies(NodeResources resources) { return matching(node -> node.resources().satisfies(resources)); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 5ffadd806d5..5f43d80b87a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -79,6 +79,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { NodeList nodes = nodeRepository().nodes().list(); resumeProvisioning(nodes); convergeToCapacity(nodes); + replaceRootDisk(nodes); return 1.0; } @@ -151,6 +152,20 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { }); } + /** Replace the root disk of hosts that have requested soft-rebuild */ + private void replaceRootDisk(NodeList nodes) { + NodeList softRebuildingHosts = nodes.rebuilding(true); + for (var host : softRebuildingHosts) { + Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Optional.of(Duration.ofSeconds(10))); + try (NodeMutex mutex = optionalMutex.get()) { + Node updatedNode = hostProvisioner.replaceRootDisk(host); + if (!updatedNode.status().wantToRebuild()) { + nodeRepository().nodes().write(updatedNode, mutex); + } + } + } + } + /** * Provision hosts to ensure there is room to allocate spare nodes. * diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index d641f59eafb..ec3e2539170 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -638,30 +638,35 @@ public class Nodes { /** Retire and deprovision given host and all of its children */ public List<Node> deprovision(String hostname, Agent agent, Instant instant) { - return decommission(hostname, DecommissionOperation.deprovision, agent, instant); + return decommission(hostname, HostOperation.deprovision, agent, instant); } - /** Retire and rebuild given host and all of its children */ - public List<Node> rebuild(String hostname, Agent agent, Instant instant) { - return decommission(hostname, DecommissionOperation.rebuild, agent, instant); + /** Rebuild given host */ + public List<Node> rebuild(String hostname, boolean soft, Agent agent, Instant instant) { + return decommission(hostname, soft ? HostOperation.softRebuild : HostOperation.rebuild, agent, instant); } - private List<Node> decommission(String hostname, DecommissionOperation op, Agent agent, Instant instant) { + private List<Node> decommission(String hostname, HostOperation op, Agent agent, Instant instant) { Optional<NodeMutex> nodeMutex = lockAndGet(hostname); if (nodeMutex.isEmpty()) return List.of(); Node host = nodeMutex.get().node(); if (!host.type().isHost()) throw new IllegalArgumentException("Cannot " + op + " non-host " + host); - List<Node> result; - boolean wantToDeprovision = op == DecommissionOperation.deprovision; - boolean wantToRebuild = op == DecommissionOperation.rebuild; + + boolean wantToDeprovision = op == HostOperation.deprovision; + boolean wantToRebuild = op == HostOperation.rebuild || op == HostOperation.softRebuild; + boolean wantToRetire = op.needsRetirement(); + List<Node> result = new ArrayList<>(); try (NodeMutex lock = nodeMutex.get(); Mutex allocationLock = lockUnallocated()) { // This takes allocationLock to prevent any further allocation of nodes on this host host = lock.node(); - result = performOn(list(allocationLock).childrenOf(host), (node, nodeLock) -> { - Node newNode = node.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant); - return write(newNode, nodeLock); - }); - Node newHost = host.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant); + if (wantToRetire) { // Apply recursively if we're retiring + List<Node> updatedNodes = performOn(list(allocationLock).childrenOf(host), (node, nodeLock) -> { + Node newNode = node.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, agent, instant); + return write(newNode, nodeLock); + }); + result.addAll(updatedNodes); + } + Node newHost = host.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild, agent, instant); result.add(write(newHost, lock)); } return result; @@ -863,10 +868,28 @@ public class Nodes { retirementRequestedByOperator; } - /** The different ways a host can be decommissioned */ - private enum DecommissionOperation { - deprovision, - rebuild, + private enum HostOperation { + + /** Host is deprovisioned and data is destroyed */ + deprovision(true), + + /** Host is deprovisioned, the same host is later re-provisioned and data is destroyed */ + rebuild(true), + + /** Host is stopped and re-bootstrapped, data is preserved */ + softRebuild(false); + + private final boolean needsRetirement; + + HostOperation(boolean needsRetirement) { + this.needsRetirement = needsRetirement; + } + + /** Returns whether this operation requires the host and its children to be retired */ + public boolean needsRetirement() { + return needsRetirement; + } + } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java index cc3f610cc44..ef0f899ca3e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java @@ -45,8 +45,8 @@ public class Status { if (wantToDeprovision && wantToRebuild) { throw new IllegalArgumentException("Node cannot be marked both wantToDeprovision and wantToRebuild"); } - if ((wantToDeprovision || wantToRebuild) && !wantToRetire) { - throw new IllegalArgumentException("Node cannot be marked wantToDeprovision or wantToRebuild unless it's also marked wantToRetire"); + if (wantToDeprovision && !wantToRetire) { + throw new IllegalArgumentException("Node cannot be marked wantToDeprovision unless it's also marked wantToRetire"); } this.wantToRetire = wantToRetire; this.wantToDeprovision = wantToDeprovision; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java new file mode 100644 index 00000000000..7aaf37a8ee6 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/CompositeOsUpgrader.java @@ -0,0 +1,28 @@ +package com.yahoo.vespa.hosted.provision.os; + +import com.yahoo.config.provision.NodeType; + +import java.util.List; + +/** + * An implementation of {@link OsUpgrader} that delegates calls to multiple implementations. + * + * @author mpolden + */ +public record CompositeOsUpgrader(List<OsUpgrader> upgraders) implements OsUpgrader { + + public CompositeOsUpgrader(List<OsUpgrader> upgraders) { + this.upgraders = List.copyOf(upgraders); + } + + @Override + public void upgradeTo(OsVersionTarget target) { + upgraders.forEach(upgrader -> upgrader.upgradeTo(target)); + } + + @Override + public void disableUpgrade(NodeType type) { + upgraders.forEach(upgrader -> upgrader.disableUpgrade(type)); + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java index 440046ab818..89fdf9d4b2a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java @@ -4,12 +4,15 @@ package com.yahoo.vespa.hosted.provision.os; import com.yahoo.component.Version; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.curator.Lock; +import com.yahoo.vespa.flags.BooleanFlag; +import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Status; import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; import java.time.Duration; +import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.function.UnaryOperator; @@ -35,18 +38,20 @@ public class OsVersions { private final NodeRepository nodeRepository; private final CuratorDatabaseClient db; - private final boolean reprovisionToUpgradeOs; + private final boolean dynamicProvisioning; private final int maxDelegatedUpgrades; + private final BooleanFlag softRebuildFlag; public OsVersions(NodeRepository nodeRepository) { - this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), MAX_DELEGATED_UPGRADES); + this(nodeRepository, nodeRepository.zone().getCloud().dynamicProvisioning(), MAX_DELEGATED_UPGRADES); } - OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades) { + OsVersions(NodeRepository nodeRepository, boolean dynamicProvisioning, int maxDelegatedUpgrades) { this.nodeRepository = Objects.requireNonNull(nodeRepository); this.db = nodeRepository.database(); - this.reprovisionToUpgradeOs = reprovisionToUpgradeOs; + this.dynamicProvisioning = dynamicProvisioning; this.maxDelegatedUpgrades = maxDelegatedUpgrades; + this.softRebuildFlag = Flags.SOFT_REBUILD.bindTo(nodeRepository.flagSource()); // Read and write all versions to make sure they are stored in the latest version of the serialized format try (var lock = db.lockOsVersionChange()) { @@ -136,8 +141,16 @@ public class OsVersions { /** Returns the upgrader to use when upgrading given node type to target */ private OsUpgrader chooseUpgrader(NodeType nodeType, Optional<Version> target) { - if (reprovisionToUpgradeOs) { - return new RetiringOsUpgrader(nodeRepository); + if (dynamicProvisioning) { + boolean softRebuild = softRebuildFlag.value(); + RetiringOsUpgrader retiringOsUpgrader = new RetiringOsUpgrader(nodeRepository, softRebuild); + if (softRebuild) { + // If soft rebuild is enabled, we can use RebuildingOsUpgrader for hosts with remote storage. + // RetiringOsUpgrader is then only used for hosts with local storage. + return new CompositeOsUpgrader(List.of(new RebuildingOsUpgrader(nodeRepository, softRebuild), + retiringOsUpgrader)); + } + return retiringOsUpgrader; } // Require rebuild if we have any nodes of this type on a major version lower than target boolean rebuildRequired = target.isPresent() && @@ -147,7 +160,7 @@ public class OsVersions { .anyMatch(osVersion -> osVersion.current().isPresent() && osVersion.current().get().getMajor() < target.get().getMajor()); if (rebuildRequired) { - return new RebuildingOsUpgrader(nodeRepository); + return new RebuildingOsUpgrader(nodeRepository, false); } return new DelegatingOsUpgrader(nodeRepository, maxDelegatedUpgrades); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java index f96effe9e10..6b61c864a0c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.os; import com.yahoo.component.Version; +import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.flags.IntFlag; import com.yahoo.vespa.flags.PermanentFlags; @@ -22,10 +23,10 @@ import java.util.Set; import java.util.logging.Logger; /** - * An upgrader that retires and rebuilds hosts on stale OS versions. + * An upgrader that rebuilds hosts on stale OS versions. * - * - We limit the number of concurrent rebuilds to reduce impact of retiring too many hosts. - * - We limit rebuilds by cluster so that at most one node per stateful cluster per application is retired at a time. + * - We limit the number of concurrent rebuilds to reduce impact of suspending or retiring too many hosts. + * - We limit rebuilds by cluster so that at most one node per stateful cluster per application is rebuilt at a time. * * Used in cases where performing an OS upgrade requires rebuilding the host, e.g. when upgrading across major versions. * @@ -37,10 +38,12 @@ public class RebuildingOsUpgrader implements OsUpgrader { private final NodeRepository nodeRepository; private final IntFlag maxRebuilds; + private final boolean softRebuild; - public RebuildingOsUpgrader(NodeRepository nodeRepository) { + public RebuildingOsUpgrader(NodeRepository nodeRepository, boolean softRebuild) { this.nodeRepository = nodeRepository; this.maxRebuilds = PermanentFlags.MAX_REBUILDS.bindTo(nodeRepository.flagSource()); + this.softRebuild = softRebuild; } @Override @@ -59,22 +62,27 @@ public class RebuildingOsUpgrader implements OsUpgrader { private int rebuildLimit(NodeType hostType, NodeList hostsOfType) { if (hostsOfType.stream().anyMatch(host -> host.type() != hostType)) illegal("All hosts must be a " + hostType); int limit = hostType == NodeType.host ? maxRebuilds.value() : 1; - return Math.max(0, limit - hostsOfType.rebuilding().size()); + return Math.max(0, limit - hostsOfType.rebuilding(softRebuild).size()); } private List<Node> rebuildableHosts(OsVersionTarget target, NodeList allNodes, Instant now) { NodeList hostsOfTargetType = allNodes.nodeType(target.nodeType()); + if (softRebuild) { + // Soft rebuild is enabled so this should only act on hosts with remote storage + hostsOfTargetType = hostsOfTargetType.storageType(NodeResources.StorageType.remote); + } int rebuildLimit = rebuildLimit(target.nodeType(), hostsOfTargetType); // Find stateful clusters with retiring nodes NodeList activeNodes = allNodes.state(Node.State.active); Set<ClusterId> retiringClusters = new HashSet<>(activeNodes.nodeType(target.nodeType().childNodeType()) - .retiring().statefulClusters()); + .retiring() + .statefulClusters()); // Rebuild hosts not containing stateful clusters with retiring nodes, up to rebuild limit List<Node> hostsToRebuild = new ArrayList<>(rebuildLimit); NodeList candidates = hostsOfTargetType.state(Node.State.active) - .not().rebuilding() + .not().rebuilding(softRebuild) .osVersionIsBefore(target.version()) .matching(node -> canUpgradeAt(now, node)) .byIncreasingOsVersion(); @@ -91,10 +99,10 @@ public class RebuildingOsUpgrader implements OsUpgrader { } private void rebuild(Node host, Version target, Instant now) { - LOG.info("Retiring and rebuilding " + host + ": On stale OS version " + + LOG.info((softRebuild ? "Soft-rebuilding " : "Retiring and rebuilding ") + host + ": On stale OS version " + host.status().osVersion().current().map(Version::toFullString).orElse("<unset>") + ", want " + target); - nodeRepository.nodes().rebuild(host.hostname(), Agent.RebuildingOsUpgrader, now); + nodeRepository.nodes().rebuild(host.hostname(), softRebuild, Agent.RebuildingOsUpgrader, now); nodeRepository.nodes().upgradeOs(NodeListFilter.from(host), Optional.of(target)); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java index 43843f6fe5a..860a17be28c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.os; import com.yahoo.component.Version; +import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -28,8 +29,11 @@ public class RetiringOsUpgrader implements OsUpgrader { protected final NodeRepository nodeRepository; - public RetiringOsUpgrader(NodeRepository nodeRepository) { + private final boolean softRebuild; + + public RetiringOsUpgrader(NodeRepository nodeRepository, boolean softRebuild) { this.nodeRepository = nodeRepository; + this.softRebuild = softRebuild; } @Override @@ -57,6 +61,10 @@ public class RetiringOsUpgrader implements OsUpgrader { /** Returns nodes that are candidates for upgrade */ private NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) { NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType()); + if (softRebuild) { + // Soft rebuild is enabled, so this should only act on hosts with local storage + activeNodes = activeNodes.storageType(NodeResources.StorageType.local); + } if (activeNodes.isEmpty()) return NodeList.of(); Duration nodeBudget = target.upgradeBudget().dividedBy(activeNodes.size()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java index 567fa9098c9..9b765adca89 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java @@ -79,6 +79,12 @@ public interface HostProvisioner { */ void deprovision(Node host); + /** Replace the root (OS) disk of host. Implementations of this are expected to be idempotent. + * + * @return the updated node object + */ + Node replaceRootDisk(Node host); + /** * Returns the maintenance events scheduled for hosts in this zone, in given cloud accounts. Host events in the * zone's default cloud account are always included. diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index 8d60dd30dd1..13753c12664 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -7,16 +7,18 @@ import com.yahoo.config.provision.CloudAccount; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.HostEvent; +import com.yahoo.config.provision.NodeAllocationException; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; -import com.yahoo.config.provision.NodeAllocationException; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.node.Address; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost; +import java.time.Instant; import java.util.ArrayList; import java.util.Collections; import java.util.EnumSet; @@ -37,6 +39,7 @@ public class MockHostProvisioner implements HostProvisioner { private final List<Flavor> flavors; private final MockNameResolver nameResolver; private final int memoryTaxGb; + private final Set<String> rebuildsCompleted = new HashSet<>(); private int deprovisionedHosts = 0; private EnumSet<Behaviour> behaviours = EnumSet.noneOf(Behaviour.class); @@ -103,6 +106,16 @@ public class MockHostProvisioner implements HostProvisioner { } @Override + public Node replaceRootDisk(Node host) { + if (!host.type().isHost()) throw new IllegalArgumentException(host + " is not a host"); + if (rebuildsCompleted.remove(host.hostname())) { + return host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(), + false, Agent.system, Instant.ofEpochMilli(123)); + } + return host; + } + + @Override public List<HostEvent> hostEventsIn(List<CloudAccount> cloudAccounts) { return Collections.unmodifiableList(hostEvents); } @@ -129,6 +142,11 @@ public class MockHostProvisioner implements HostProvisioner { return this; } + public MockHostProvisioner completeRebuildOf(Node host) { + rebuildsCompleted.add(host.hostname()); + return this; + } + public MockHostProvisioner overrideHostFlavor(String flavorName) { Flavor flavor = flavors.stream().filter(f -> f.name().equals(flavorName)) .findFirst() diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index e5e361da379..72b49a4794a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -603,6 +603,30 @@ public class DynamicProvisioningMaintainerTest { } } + @Test + public void rebuild_host() { + var tester = new DynamicProvisioningTester(); + Node host1 = tester.addNode("host1", Optional.empty(), NodeType.host, Node.State.active); + Node host11 = tester.addNode("host1-1", Optional.of("host1"), NodeType.tenant, Node.State.parked, DynamicProvisioningTester.tenantApp); + Node host2 = tester.addNode("host2", Optional.empty(), NodeType.host, Node.State.active); + Node host21 = tester.addNode("host2-1", Optional.of("host2"), NodeType.tenant, Node.State.parked, DynamicProvisioningTester.tenantApp); + + // No rebuilds in initial run + tester.maintainer.maintain(); + assertEquals(0, tester.nodeRepository.nodes().list().rebuilding(true).size()); + + // Host starts rebuilding + tester.nodeRepository.nodes().rebuild(host1.hostname(), true, Agent.RebuildingOsUpgrader, + tester.nodeRepository.clock().instant()); + tester.maintainer.maintain(); + assertEquals(1, tester.nodeRepository.nodes().list().rebuilding(true).size()); + + // Rebuild completes + tester.hostProvisioner.completeRebuildOf(host1); + tester.maintainer.maintain(); + assertEquals(0, tester.nodeRepository.nodes().list().rebuilding(true).size()); + } + private void assertCfghost3IsActive(DynamicProvisioningTester tester) { assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).size()); assertEquals(3, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.confighost).size()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java index 3d7db9a1f96..4d75b8a5acc 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java @@ -8,6 +8,7 @@ import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.test.ManualClock; +import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.PermanentFlags; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -273,35 +274,35 @@ public class OsVersionsTest { versions.resumeUpgradeOf(NodeType.host, true); // One host starts rebuilding - assertEquals(1, hostNodes.get().rebuilding().size()); + assertEquals(1, hostNodes.get().rebuilding(false).size()); // We cannot rebuild another host until the current one is done versions.resumeUpgradeOf(NodeType.host, true); - NodeList hostsRebuilding = hostNodes.get().rebuilding(); + NodeList hostsRebuilding = hostNodes.get().rebuilding(false); assertEquals(1, hostsRebuilding.size()); completeRebuildOf(hostsRebuilding.asList(), NodeType.host); assertEquals(1, hostNodes.get().onOsVersion(version1).size()); // Second host is rebuilt versions.resumeUpgradeOf(NodeType.host, true); - completeRebuildOf(hostNodes.get().rebuilding().asList(), NodeType.host); + completeRebuildOf(hostNodes.get().rebuilding(false).asList(), NodeType.host); assertEquals(2, hostNodes.get().onOsVersion(version1).size()); // The remaining hosts complete their upgrade for (int i = 0; i < hostCount - 2; i++) { versions.resumeUpgradeOf(NodeType.host, true); - hostsRebuilding = hostNodes.get().rebuilding(); + hostsRebuilding = hostNodes.get().rebuilding(false); assertEquals(1, hostsRebuilding.size()); completeRebuildOf(hostsRebuilding.asList(), NodeType.host); } // All hosts upgraded and none are rebuilding - assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().rebuilding().size()); + assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().rebuilding(false).size()); assertEquals(hostCount, tester.nodeRepository().nodes().list(Node.State.active).size()); // Resuming after everything has upgraded has no effect versions.resumeUpgradeOf(NodeType.host, true); - assertEquals(0, hostNodes.get().rebuilding().size()); + assertEquals(0, hostNodes.get().rebuilding(false).size()); // Next version is within same major. Upgrade mechanism switches to delegated var version2 = Version.fromString("8.1"); @@ -319,12 +320,62 @@ public class OsVersionsTest { // Resuming upgrades reactivated host. Upgrade mechanism switches to rebuilding versions.resumeUpgradeOf(NodeType.host, true); - hostsRebuilding = hostNodes.get().rebuilding(); + hostsRebuilding = hostNodes.get().rebuilding(false); assertEquals(List.of(reactivatedHost), hostsRebuilding.asList()); completeRebuildOf(hostsRebuilding.asList(), NodeType.host); } @Test + public void upgrade_by_soft_rebuilding() { + int maxRebuilds = 3; + int hostCount = 12; + boolean softRebuild = true; + + tester.flagSource().withIntFlag(PermanentFlags.MAX_REBUILDS.id(), maxRebuilds); + tester.flagSource().withBooleanFlag(Flags.SOFT_REBUILD.id(), softRebuild); + var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE); + + provisionInfraApplication(hostCount, infraApplication, NodeType.host, NodeResources.StorageType.remote); + Supplier<NodeList> hostNodes = () -> tester.nodeRepository().nodes().list().nodeType(NodeType.host); + + // New target is set + int hostsRebuilt = 0; + var version1 = Version.fromString("8.0"); + versions.setTarget(NodeType.host, version1, Duration.ZERO, false); + versions.resumeUpgradeOf(NodeType.host, true); + + // First batch of hosts start rebuilding + assertEquals(maxRebuilds, hostNodes.get().rebuilding(softRebuild).size()); + + // We cannot rebuild another host yet + versions.resumeUpgradeOf(NodeType.host, true); + NodeList hostsRebuilding = hostNodes.get().rebuilding(softRebuild); + assertEquals(maxRebuilds, hostsRebuilding.size()); + completeSoftRebuildOf(hostsRebuilding.asList()); + assertEquals(hostsRebuilt += maxRebuilds, hostNodes.get().onOsVersion(version1).size()); + + // Another batch is rebuilt + versions.resumeUpgradeOf(NodeType.host, true); + completeSoftRebuildOf(hostNodes.get().rebuilding(softRebuild).asList()); + assertEquals(hostsRebuilt += maxRebuilds, hostsRebuilt); + + // The remaining batches complete their upgrade + for (int i = 0; i < (hostCount - hostsRebuilt) / maxRebuilds; i++) { + versions.resumeUpgradeOf(NodeType.host, true); + hostsRebuilding = hostNodes.get().rebuilding(softRebuild); + assertEquals(maxRebuilds, hostsRebuilding.size()); + completeSoftRebuildOf(hostsRebuilding.asList()); + } + + // All hosts upgraded and none are rebuilding + assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().rebuilding(softRebuild).size()); + + // Resuming after everything has upgraded has no effect + versions.resumeUpgradeOf(NodeType.host, true); + assertEquals(0, hostNodes.get().rebuilding(softRebuild).size()); + } + + @Test public void upgrade_by_rebuilding_multiple_host_types() { tester.flagSource().withIntFlag(PermanentFlags.MAX_REBUILDS.id(), 1); var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE); @@ -349,7 +400,7 @@ public class OsVersionsTest { for (int i = 0; i < hostCount; i++) { versions.resumeUpgradeOf(NodeType.host, true); versions.resumeUpgradeOf(NodeType.confighost, true); - NodeList hostsRebuilding = hosts.get().rebuilding(); + NodeList hostsRebuilding = hosts.get().rebuilding(false); assertEquals(2, hostsRebuilding.size()); completeRebuildOf(hostsRebuilding.nodeType(NodeType.host).asList(), NodeType.host); completeRebuildOf(hostsRebuilding.nodeType(NodeType.confighost).asList(), NodeType.confighost); @@ -382,7 +433,7 @@ public class OsVersionsTest { versions.resumeUpgradeOf(NodeType.host, true); NodeList allNodes = tester.nodeRepository().nodes().list(); List<Node> hostsRebuilding = allNodes.nodeType(NodeType.host) - .rebuilding() + .rebuilding(false) .sortedBy(Comparator.comparing(Node::hostname)) .asList(); List<Optional<ApplicationId>> owners = List.of(Optional.of(app1), Optional.of(app2), Optional.empty()); @@ -420,7 +471,7 @@ public class OsVersionsTest { // Since both applications now occupy all remaining hosts, we can only upgrade 1 at a time for (int i = 0; i < hostsOnOldVersion.size(); i++) { versions.resumeUpgradeOf(NodeType.host, true); - hostsRebuilding = hosts.get().rebuilding().asList(); + hostsRebuilding = hosts.get().rebuilding(false).asList(); assertEquals(1, hostsRebuilding.size()); replaceNodes(app1); replaceNodes(app2); @@ -430,7 +481,7 @@ public class OsVersionsTest { // Resuming upgrade has no effect as all hosts have upgraded versions.resumeUpgradeOf(NodeType.host, true); NodeList allHosts = hosts.get(); - assertEquals(0, allHosts.rebuilding().size()); + assertEquals(0, allHosts.rebuilding(false).size()); assertEquals(allHosts.size(), allHosts.onOsVersion(version1).size()); } @@ -454,7 +505,7 @@ public class OsVersionsTest { // Upgrades 1 infrastructure host at a time for (int i = 0; i < hostCount; i++) { versions.resumeUpgradeOf(NodeType.proxyhost, true); - List<Node> hostsRebuilding = hosts.get().rebuilding().asList(); + List<Node> hostsRebuilding = hosts.get().rebuilding(false).asList(); assertEquals(1, hostsRebuilding.size()); completeRebuildOf(hostsRebuilding, NodeType.proxyhost); } @@ -490,7 +541,13 @@ public class OsVersionsTest { } private List<Node> provisionInfraApplication(int nodeCount, ApplicationId application, NodeType nodeType) { - var nodes = tester.makeReadyNodes(nodeCount, new NodeResources(48, 128, 2000, 10), nodeType, 10); + return provisionInfraApplication(nodeCount, application, nodeType, NodeResources.StorageType.local); + } + + private List<Node> provisionInfraApplication(int nodeCount, ApplicationId application, NodeType nodeType, NodeResources.StorageType storageType) { + var nodes = tester.makeReadyNodes(nodeCount, new NodeResources(48, 128, 2000, 10, + NodeResources.DiskSpeed.fast, storageType), + nodeType, 10); tester.prepareAndActivateInfraApplication(application, nodeType); return nodes.stream() .map(Node::hostname) @@ -557,4 +614,15 @@ public class OsVersionsTest { }); } + private void completeSoftRebuildOf(List<Node> nodes) { + tester.patchNodes(nodes, (node) -> { + Optional<Version> wantedOsVersion = node.status().osVersion().wanted(); + assertFalse(node + " is not retiring", node.status().wantToRetire()); + assertTrue(node + " is rebuilding", node.status().wantToRebuild()); + node = node.withWantToRetire(false, false, false, Agent.system, + tester.clock().instant()); + return node.with(node.status().withOsVersion(node.status().osVersion().withCurrent(wantedOsVersion))); + }); + } + } |