about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2021-05-20 14:40:59 +0200
committerGitHub <noreply@github.com>2021-05-20 14:40:59 +0200
commit560bb94804bd1508c1f93396de55366cec7cf2c8 (patch)
treedbcd8f305128ac3669e3756046bd106c03204f3e
parentb90838cbd40c42533d1bfb2b8b81a017cb030c07 (diff)
parent74097924fbb5fc3db11bd92cd929ddb1ccf53705 (diff)
Merge pull request #17918 from vespa-engine/mpolden/limit-infra-rebuild
Limit rebuild of infrastructure clusters
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java26
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java44
2 files changed, 58 insertions, 12 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
index 71f1af09930..fce9eb562c9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
@@ -58,15 +58,21 @@ public class RebuildingOsUpgrader implements OsUpgrader {
// No action needed in this implementation. Hosts that have started rebuilding cannot be halted
}
+ /** Returns the number of hosts of given type that can be rebuilt concurrently */
+ private int upgradeLimit(NodeType hostType, NodeList hosts) {
+ int limit = hostType == NodeType.host ? maxRebuilds.value() : 1;
+ return Math.max(0, limit - hosts.rebuilding().size());
+ }
+
private List<Node> rebuildableHosts(OsVersionTarget target, NodeList allNodes) {
NodeList hostsOfTargetType = allNodes.nodeType(target.nodeType());
NodeList activeHosts = hostsOfTargetType.state(Node.State.active);
- int upgradeLimit = Math.max(0, maxRebuilds.value() - hostsOfTargetType.rebuilding().size());
+ int upgradeLimit = upgradeLimit(target.nodeType(), hostsOfTargetType);
// Find stateful clusters with retiring nodes
NodeList activeNodes = allNodes.state(Node.State.active);
- Set<ClusterKey> retiringClusters = statefulClustersOf(activeNodes.nodeType(target.nodeType().childNodeType())
- .retiring());
+ Set<ClusterId> retiringClusters = statefulClustersOf(activeNodes.nodeType(target.nodeType().childNodeType())
+ .retiring());
// Upgrade hosts not running stateful clusters that are already retiring
List<Node> hostsToUpgrade = new ArrayList<>(upgradeLimit);
@@ -75,7 +81,7 @@ public class RebuildingOsUpgrader implements OsUpgrader {
.byIncreasingOsVersion();
for (Node host : candidates) {
if (hostsToUpgrade.size() == upgradeLimit) break;
- Set<ClusterKey> clustersOnHost = statefulClustersOf(activeNodes.childrenOf(host));
+ Set<ClusterId> clustersOnHost = statefulClustersOf(activeNodes.childrenOf(host));
boolean canUpgrade = Collections.disjoint(retiringClusters, clustersOnHost);
if (canUpgrade) {
hostsToUpgrade.add(host);
@@ -93,24 +99,24 @@ public class RebuildingOsUpgrader implements OsUpgrader {
nodeRepository.nodes().upgradeOs(NodeListFilter.from(host), Optional.of(target));
}
- private static Set<ClusterKey> statefulClustersOf(NodeList nodes) {
- Set<ClusterKey> clusters = new HashSet<>();
+ private static Set<ClusterId> statefulClustersOf(NodeList nodes) {
+ Set<ClusterId> clusters = new HashSet<>();
for (Node node : nodes) {
if (node.type().isHost()) throw new IllegalArgumentException("All nodes must be children, got host " + node);
if (node.allocation().isEmpty()) continue;
Allocation allocation = node.allocation().get();
if (!allocation.membership().cluster().isStateful()) continue;
- clusters.add(new ClusterKey(allocation.owner(), allocation.membership().cluster().id()));
+ clusters.add(new ClusterId(allocation.owner(), allocation.membership().cluster().id()));
}
return clusters;
}
- private static class ClusterKey {
+ private static class ClusterId {
private final ApplicationId application;
private final ClusterSpec.Id cluster;
- public ClusterKey(ApplicationId application, ClusterSpec.Id cluster) {
+ public ClusterId(ApplicationId application, ClusterSpec.Id cluster) {
this.application = Objects.requireNonNull(application);
this.cluster = Objects.requireNonNull(cluster);
}
@@ -119,7 +125,7 @@ public class RebuildingOsUpgrader implements OsUpgrader {
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
- ClusterKey that = (ClusterKey) o;
+ ClusterId that = (ClusterId) o;
return application.equals(that.application) && cluster.equals(that.cluster);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
index 84e69585518..b3ee30e4258 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.os;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.Capacity;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.HostSpec;
import com.yahoo.config.provision.NodeResources;
@@ -23,6 +24,7 @@ import java.time.temporal.ChronoUnit;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
+import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
@@ -432,6 +434,40 @@ public class OsVersionsTest {
assertEquals(allHosts.size(), allHosts.onOsVersion(version1).size());
}
+ @Test
+ public void upgrade_by_rebuilding_is_limited_by_infrastructure_host() {
+ int hostCount = 3;
+ tester.flagSource().withIntFlag(PermanentFlags.MAX_REBUILDS.id(), hostCount);
+ var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE);
+ ApplicationId routingApp = ApplicationId.from("t1", "a1", "i1");
+ List<Node> proxyHosts = provisionInfraApplication(hostCount, infraApplication, NodeType.proxyhost);
+ for (int i = 0; i < proxyHosts.size(); i++) {
+ tester.makeReadyChildren(1, i, new NodeResources(4,8, 100, 0.3),
+ NodeType.proxy, proxyHosts.get(i).hostname(), (index) -> "proxy" + index);
+ }
+ Capacity capacity = Capacity.fromRequiredNodeType(NodeType.proxy);
+ tester.deploy(routingApp, capacity);
+ Supplier<NodeList> hosts = () -> tester.nodeRepository().nodes().list().nodeType(NodeType.proxyhost);
+
+ // All hosts are on initial version
+ var version0 = Version.fromString("7.0");
+ versions.setTarget(NodeType.proxyhost, version0, Duration.ZERO, false);
+ setCurrentVersion(hosts.get().asList(), version0);
+
+ // Target is set for new major version
+ var version1 = Version.fromString("8.0");
+ versions.setTarget(NodeType.proxyhost, version1, Duration.ZERO, false);
+
+ // Upgrades 1 host at a time
+ for (int i = 0; i < hostCount; i++) {
+ versions.resumeUpgradeOf(NodeType.proxyhost, true);
+ List<Node> hostsRebuilding = hosts.get().rebuilding().asList();
+ assertEquals(1, hostsRebuilding.size());
+ replaceNodes(routingApp, (app) -> tester.deploy(app, capacity));
+ completeRebuildOf(hostsRebuilding, NodeType.proxyhost);
+ }
+ }
+
private void deployApplication(ApplicationId application) {
ClusterSpec contentSpec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("content1")).vespaVersion("7").build();
List<HostSpec> hostSpecs = tester.prepare(application, contentSpec, 2, 1, new NodeResources(4, 8, 100, 0.3));
@@ -439,14 +475,18 @@ public class OsVersionsTest {
}
private void replaceNodes(ApplicationId application) {
+ replaceNodes(application, this::deployApplication);
+ }
+
+ private void replaceNodes(ApplicationId application, Consumer<ApplicationId> deployer) {
// Deploy to retire nodes
- deployApplication(application);
+ deployer.accept(application);
List<Node> retired = tester.nodeRepository().nodes().list().owner(application).retired().asList();
assertFalse("At least one node is retired", retired.isEmpty());
tester.nodeRepository().nodes().setRemovable(application, retired);
// Redeploy to deactivate removable nodes and allocate new ones
- deployApplication(application);
+ deployer.accept(application);
tester.nodeRepository().nodes().list(Node.State.inactive).owner(application)
.forEach(node -> tester.nodeRepository().nodes().removeRecursively(node, true));
}