diff options
author | Jon Bratseth <bratseth@oath.com> | 2021-05-20 14:40:59 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-20 14:40:59 +0200 |
commit | 560bb94804bd1508c1f93396de55366cec7cf2c8 (patch) | |
tree | dbcd8f305128ac3669e3756046bd106c03204f3e | |
parent | b90838cbd40c42533d1bfb2b8b81a017cb030c07 (diff) | |
parent | 74097924fbb5fc3db11bd92cd929ddb1ccf53705 (diff) |
Merge pull request #17918 from vespa-engine/mpolden/limit-infra-rebuild
Limit rebuild of infrastructure clusters
2 files changed, 58 insertions, 12 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java index 71f1af09930..fce9eb562c9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java @@ -58,15 +58,21 @@ public class RebuildingOsUpgrader implements OsUpgrader { // No action needed in this implementation. Hosts that have started rebuilding cannot be halted } + /** Returns the number of hosts of given type that can be rebuilt concurrently */ + private int upgradeLimit(NodeType hostType, NodeList hosts) { + int limit = hostType == NodeType.host ? maxRebuilds.value() : 1; + return Math.max(0, limit - hosts.rebuilding().size()); + } + private List<Node> rebuildableHosts(OsVersionTarget target, NodeList allNodes) { NodeList hostsOfTargetType = allNodes.nodeType(target.nodeType()); NodeList activeHosts = hostsOfTargetType.state(Node.State.active); - int upgradeLimit = Math.max(0, maxRebuilds.value() - hostsOfTargetType.rebuilding().size()); + int upgradeLimit = upgradeLimit(target.nodeType(), hostsOfTargetType); // Find stateful clusters with retiring nodes NodeList activeNodes = allNodes.state(Node.State.active); - Set<ClusterKey> retiringClusters = statefulClustersOf(activeNodes.nodeType(target.nodeType().childNodeType()) - .retiring()); + Set<ClusterId> retiringClusters = statefulClustersOf(activeNodes.nodeType(target.nodeType().childNodeType()) + .retiring()); // Upgrade hosts not running stateful clusters that are already retiring List<Node> hostsToUpgrade = new ArrayList<>(upgradeLimit); @@ -75,7 +81,7 @@ public class RebuildingOsUpgrader implements OsUpgrader { .byIncreasingOsVersion(); for (Node host : candidates) { if (hostsToUpgrade.size() == upgradeLimit) break; - Set<ClusterKey> clustersOnHost = statefulClustersOf(activeNodes.childrenOf(host)); + Set<ClusterId> clustersOnHost = statefulClustersOf(activeNodes.childrenOf(host)); boolean canUpgrade = Collections.disjoint(retiringClusters, clustersOnHost); if (canUpgrade) { hostsToUpgrade.add(host); @@ -93,24 +99,24 @@ public class RebuildingOsUpgrader implements OsUpgrader { nodeRepository.nodes().upgradeOs(NodeListFilter.from(host), Optional.of(target)); } - private static Set<ClusterKey> statefulClustersOf(NodeList nodes) { - Set<ClusterKey> clusters = new HashSet<>(); + private static Set<ClusterId> statefulClustersOf(NodeList nodes) { + Set<ClusterId> clusters = new HashSet<>(); for (Node node : nodes) { if (node.type().isHost()) throw new IllegalArgumentException("All nodes must be children, got host " + node); if (node.allocation().isEmpty()) continue; Allocation allocation = node.allocation().get(); if (!allocation.membership().cluster().isStateful()) continue; - clusters.add(new ClusterKey(allocation.owner(), allocation.membership().cluster().id())); + clusters.add(new ClusterId(allocation.owner(), allocation.membership().cluster().id())); } return clusters; } - private static class ClusterKey { + private static class ClusterId { private final ApplicationId application; private final ClusterSpec.Id cluster; - public ClusterKey(ApplicationId application, ClusterSpec.Id cluster) { + public ClusterId(ApplicationId application, ClusterSpec.Id cluster) { this.application = Objects.requireNonNull(application); this.cluster = Objects.requireNonNull(cluster); } @@ -119,7 +125,7 @@ public class RebuildingOsUpgrader implements OsUpgrader { public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - ClusterKey that = (ClusterKey) o; + ClusterId that = (ClusterId) o; return application.equals(that.application) && cluster.equals(that.cluster); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java index 84e69585518..b3ee30e4258 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.os; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.Capacity; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.NodeResources; @@ -23,6 +24,7 @@ import java.time.temporal.ChronoUnit; import java.util.Comparator; import java.util.List; import java.util.Optional; +import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Collectors; @@ -432,6 +434,40 @@ public class OsVersionsTest { assertEquals(allHosts.size(), allHosts.onOsVersion(version1).size()); } + @Test + public void upgrade_by_rebuilding_is_limited_by_infrastructure_host() { + int hostCount = 3; + tester.flagSource().withIntFlag(PermanentFlags.MAX_REBUILDS.id(), hostCount); + var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE); + ApplicationId routingApp = ApplicationId.from("t1", "a1", "i1"); + List<Node> proxyHosts = provisionInfraApplication(hostCount, infraApplication, NodeType.proxyhost); + for (int i = 0; i < proxyHosts.size(); i++) { + tester.makeReadyChildren(1, i, new NodeResources(4,8, 100, 0.3), + NodeType.proxy, proxyHosts.get(i).hostname(), (index) -> "proxy" + index); + } + Capacity capacity = Capacity.fromRequiredNodeType(NodeType.proxy); + tester.deploy(routingApp, capacity); + Supplier<NodeList> hosts = () -> tester.nodeRepository().nodes().list().nodeType(NodeType.proxyhost); + + // All hosts are on initial version + var version0 = Version.fromString("7.0"); + versions.setTarget(NodeType.proxyhost, version0, Duration.ZERO, false); + setCurrentVersion(hosts.get().asList(), version0); + + // Target is set for new major version + var version1 = Version.fromString("8.0"); + versions.setTarget(NodeType.proxyhost, version1, Duration.ZERO, false); + + // Upgrades 1 host at a time + for (int i = 0; i < hostCount; i++) { + versions.resumeUpgradeOf(NodeType.proxyhost, true); + List<Node> hostsRebuilding = hosts.get().rebuilding().asList(); + assertEquals(1, hostsRebuilding.size()); + replaceNodes(routingApp, (app) -> tester.deploy(app, capacity)); + completeRebuildOf(hostsRebuilding, NodeType.proxyhost); + } + } + private void deployApplication(ApplicationId application) { ClusterSpec contentSpec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("content1")).vespaVersion("7").build(); List<HostSpec> hostSpecs = tester.prepare(application, contentSpec, 2, 1, new NodeResources(4, 8, 100, 0.3)); @@ -439,14 +475,18 @@ public class OsVersionsTest { } private void replaceNodes(ApplicationId application) { + replaceNodes(application, this::deployApplication); + } + + private void replaceNodes(ApplicationId application, Consumer<ApplicationId> deployer) { // Deploy to retire nodes - deployApplication(application); + deployer.accept(application); List<Node> retired = tester.nodeRepository().nodes().list().owner(application).retired().asList(); assertFalse("At least one node is retired", retired.isEmpty()); tester.nodeRepository().nodes().setRemovable(application, retired); // Redeploy to deactivate removable nodes and allocate new ones - deployApplication(application); + deployer.accept(application); tester.nodeRepository().nodes().list(Node.State.inactive).owner(application) .forEach(node -> tester.nodeRepository().nodes().removeRecursively(node, true)); } |