aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2021-04-13 12:34:08 +0200
committerGitHub <noreply@github.com>2021-04-13 12:34:08 +0200
commitd4f8049c2e3fa2224df5d4067c9b342f9a14b8c0 (patch)
tree8c87b79efaf99685ed2f6ff7264404c618ab64a0
parentc752d3e4bcc1946358ccdee716f4141bf87748df (diff)
parent58db69342f40af1af39542e6e939048bef71b320 (diff)
Merge pull request #17390 from vespa-engine/mpolden/limit-rebuilds
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java25
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java14
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java26
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java39
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java5
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java19
9 files changed, 84 insertions, 52 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index f647130651e..522027654a1 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -9,9 +9,6 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.List;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
index 7a29414642e..856d534bbd2 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
@@ -12,7 +12,7 @@ import java.time.Duration;
import java.util.List;
/**
- * This moves nodes of type {@link NodeType#host} from preovisioned to parked if they have been in provisioned too long.
+ * This moves nodes of type {@link NodeType#host} from provisioned to parked if they have been in provisioned too long.
*
* Only {@link NodeType#host} is moved because any number of nodes of that type can exist. Other node types such as
* {@link NodeType#confighost} have a fixed number and thus cannot be replaced while the fixed number of nodes exist in
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 3e4221ecfcd..41c7c1f5f1c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -17,7 +17,6 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer;
import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter;
-import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter;
import com.yahoo.vespa.hosted.provision.node.filter.StateFilter;
import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
import com.yahoo.vespa.hosted.provision.restapi.NotFoundException;
@@ -181,10 +180,13 @@ public class Nodes {
.map(node -> {
if (node.state() != Node.State.provisioned && node.state() != Node.State.dirty)
illegal("Can not set " + node + " ready. It is not provisioned or dirty.");
- return node.withWantToRetire(false, false, Agent.system, clock.instant());
+ return node.withWantToRetire(false,
+ false,
+ false,
+ Agent.system,
+ clock.instant());
})
.collect(Collectors.toList());
-
return db.writeTo(Node.State.ready, nodesWithResetFields, agent, Optional.of(reason));
}
}
@@ -201,15 +203,14 @@ public class Nodes {
public Node restore(String hostname, Agent agent, String reason) {
// A deprovisioned host has no children so this doesn't need to to be recursive
try (NodeMutex lock = lockAndGetRequired(hostname)) {
- Node existing = lock.node();
- if (existing.state() != Node.State.deprovisioned) illegal("Can not move node " + hostname + " to " +
- Node.State.provisioned + ". It is not in " +
- Node.State.deprovisioned);
- if (!existing.status().wantToRebuild()) illegal("Can not move node " + hostname + " to " +
- Node.State.provisioned +
- ". Rebuild has not been requested");
- Node nodeWithResetFields = existing.withWantToRetire(false, false, false, agent, clock.instant());
- return db.writeTo(Node.State.provisioned, nodeWithResetFields, agent, Optional.of(reason));
+ Node node = lock.node();
+ if (node.state() != Node.State.deprovisioned) illegal("Can not move node " + hostname + " to " +
+ Node.State.provisioned + ". It is not in " +
+ Node.State.deprovisioned);
+ if (!node.status().wantToRebuild()) illegal("Can not move node " + hostname + " to " +
+ Node.State.provisioned +
+ ". Rebuild has not been requested");
+ return db.writeTo(Node.State.provisioned, node, agent, Optional.of(reason));
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java
index af17934a878..84454e0d06a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java
@@ -31,7 +31,8 @@ public class DelegatingOsUpgrader implements OsUpgrader {
public DelegatingOsUpgrader(NodeRepository nodeRepository, int maxActiveUpgrades) {
this.nodeRepository = Objects.requireNonNull(nodeRepository);
this.maxActiveUpgrades = maxActiveUpgrades;
- if (maxActiveUpgrades < 1) throw new IllegalArgumentException("maxActiveUpgrades must be positive");
+ if (maxActiveUpgrades < 1) throw new IllegalArgumentException("maxActiveUpgrades must be positive, was " +
+ maxActiveUpgrades);
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java
index 613738458c2..d3e09fbed2f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java
@@ -30,20 +30,28 @@ public class OsVersions {
private static final Logger log = Logger.getLogger(OsVersions.class.getName());
+ /** The maximum number of concurrent upgrades triggered by {@link DelegatingOsUpgrader} */
+ private static final int MAX_DELEGATED_UPGRADES = 30;
+
+ /** The maximum number of concurrent upgrades (rebuilds) triggered by {@link RebuildingOsUpgrader} */
+ private static final int MAX_REBUILDS = 3;
+
private final NodeRepository nodeRepository;
private final CuratorDatabaseClient db;
private final boolean reprovisionToUpgradeOs;
private final int maxDelegatedUpgrades;
+ private final int maxRebuilds;
public OsVersions(NodeRepository nodeRepository) {
- this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), 30);
+ this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), MAX_DELEGATED_UPGRADES, MAX_REBUILDS);
}
- OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades) {
+ OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades, int maxRebuilds) {
this.nodeRepository = Objects.requireNonNull(nodeRepository);
this.db = nodeRepository.database();
this.reprovisionToUpgradeOs = reprovisionToUpgradeOs;
this.maxDelegatedUpgrades = maxDelegatedUpgrades;
+ this.maxRebuilds = maxRebuilds;
// Read and write all versions to make sure they are stored in the latest version of the serialized format
try (var lock = db.lockOsVersionChange()) {
@@ -134,7 +142,7 @@ public class OsVersions {
.anyMatch(osVersion -> osVersion.current().isPresent() &&
osVersion.current().get().getMajor() < target.getMajor());
if (rebuildRequired) {
- return new RebuildingOsUpgrader(nodeRepository);
+ return new RebuildingOsUpgrader(nodeRepository, maxRebuilds);
}
return new DelegatingOsUpgrader(nodeRepository, maxDelegatedUpgrades);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
index 0e10e9f44de..77d0f88eb98 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
@@ -24,16 +24,28 @@ public class RebuildingOsUpgrader extends RetiringOsUpgrader {
private static final Logger LOG = Logger.getLogger(RebuildingOsUpgrader.class.getName());
- public RebuildingOsUpgrader(NodeRepository nodeRepository) {
+ private final int maxRebuilds;
+
+ public RebuildingOsUpgrader(NodeRepository nodeRepository, int maxRebuilds) {
super(nodeRepository);
+ this.maxRebuilds = maxRebuilds;
+ if (maxRebuilds < 1) throw new IllegalArgumentException("maxRebuilds must be positive, was " + maxRebuilds);
+ }
+
+ @Override
+ protected NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) {
+ if (allNodes.rebuilding().size() < maxRebuilds) {
+ return super.candidates(instant, target, allNodes);
+ }
+ return NodeList.of();
}
- protected void upgradeNodes(NodeList activeNodes, Version version, Instant instant) {
- activeNodes.osVersionIsBefore(version)
- .not().rebuilding()
- .byIncreasingOsVersion()
- .first(1)
- .forEach(node -> rebuild(node, version, instant));
+ @Override
+ protected void upgradeNodes(NodeList candidates, Version version, Instant instant) {
+ candidates.not().rebuilding()
+ .byIncreasingOsVersion()
+ .first(1)
+ .forEach(node -> rebuild(node, version, instant));
}
private void rebuild(Node host, Version target, Instant now) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
index 61d9c6b6b5d..cee52cb2177 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
@@ -33,31 +33,36 @@ public class RetiringOsUpgrader implements OsUpgrader {
}
@Override
- public void upgradeTo(OsVersionTarget target) {
+ public final void upgradeTo(OsVersionTarget target) {
NodeList allNodes = nodeRepository.nodes().list();
- NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType());
- if (activeNodes.isEmpty()) return; // No nodes eligible for upgrade
-
Instant now = nodeRepository.clock().instant();
- Duration nodeBudget = target.upgradeBudget()
- .dividedBy(activeNodes.size());
- Instant retiredAt = target.lastRetiredAt().orElse(Instant.EPOCH);
- if (now.isBefore(retiredAt.plus(nodeBudget))) return; // Budget has not been spent yet
-
- upgradeNodes(activeNodes, target.version(), now);
+ NodeList candidates = candidates(now, target, allNodes);
+ upgradeNodes(candidates, target.version(), now);
}
@Override
- public void disableUpgrade(NodeType type) {
+ public final void disableUpgrade(NodeType type) {
// No action needed in this implementation.
}
- protected void upgradeNodes(NodeList activeNodes, Version version, Instant instant) {
- activeNodes.osVersionIsBefore(version)
- .not().deprovisioning()
- .byIncreasingOsVersion()
- .first(1)
- .forEach(node -> deprovision(node, version, instant));
+ /** Returns nodes that are candidates for upgrade */
+ protected NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) {
+ NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType());
+ if (activeNodes.isEmpty()) return NodeList.of();
+
+ Duration nodeBudget = target.upgradeBudget().dividedBy(activeNodes.size());
+ Instant retiredAt = target.lastRetiredAt().orElse(Instant.EPOCH);
+ if (instant.isBefore(retiredAt.plus(nodeBudget))) return NodeList.of(); // Budget has not been spent yet
+
+ return activeNodes.osVersionIsBefore(target.version());
+ }
+
+ /** Trigger upgrade of candidates to given version */
+ protected void upgradeNodes(NodeList candidates, Version version, Instant instant) {
+ candidates.not().deprovisioning()
+ .byIncreasingOsVersion()
+ .first(1)
+ .forEach(node -> deprovision(node, version, instant));
}
/** Upgrade given host by retiring and deprovisioning it */
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
index f9e7da9b563..1ec462b0620 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
@@ -246,6 +246,11 @@ public class NodeRepositoryTest {
Node node = tester.nodeRepository().nodes().restore(host2, Agent.system, getClass().getSimpleName());
assertSame(Node.State.provisioned, node.state());
assertEquals("IP addresses are preserved", ipConfigOfHost2, node.ipConfig());
+ assertTrue(node.status().wantToRetire());
+ assertTrue(node.status().wantToRebuild());
+
+ // Readying host clears rebuild flag
+ node = tester.nodeRepository().nodes().setReady(host2, Agent.system, getClass().getSimpleName());
assertFalse(node.status().wantToRetire());
assertFalse(node.status().wantToRebuild());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
index 8c9e52e80f5..f3e5778cb60 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
@@ -91,7 +91,7 @@ public class OsVersionsTest {
public void max_active_upgrades() {
int totalNodes = 20;
int maxActiveUpgrades = 5;
- var versions = new OsVersions(tester.nodeRepository(), false, maxActiveUpgrades);
+ var versions = new OsVersions(tester.nodeRepository(), false, maxActiveUpgrades, Integer.MAX_VALUE);
provisionInfraApplication(totalNodes);
Supplier<NodeList> hostNodes = () -> tester.nodeRepository().nodes().list().state(Node.State.active).hosts();
@@ -156,7 +156,7 @@ public class OsVersionsTest {
@Test
public void upgrade_by_retiring() {
- var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE);
+ var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE, Integer.MAX_VALUE);
var clock = (ManualClock) tester.nodeRepository().clock();
int hostCount = 10;
// Provision hosts and children
@@ -221,7 +221,7 @@ public class OsVersionsTest {
@Test
public void upgrade_by_retiring_everything_at_once() {
- var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE);
+ var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE, Integer.MAX_VALUE);
int hostCount = 3;
provisionInfraApplication(hostCount, NodeType.confighost);
Supplier<NodeList> hostNodes = () -> tester.nodeRepository().nodes().list()
@@ -244,7 +244,7 @@ public class OsVersionsTest {
@Test
public void upgrade_by_rebuilding() {
- var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE);
+ var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE, 1);
var clock = tester.clock();
int hostCount = 10;
provisionInfraApplication(hostCount + 1);
@@ -273,14 +273,17 @@ public class OsVersionsTest {
versions.resumeUpgradeOf(NodeType.host, true);
assertEquals(1, hostNodes.get().rebuilding().size());
- // Budget has been spent and another host is rebuilt
+ // Time budget has been spent, but we cannot rebuild another host until the current one is done
clock.advance(nodeBudget);
versions.resumeUpgradeOf(NodeType.host, true);
NodeList hostsRebuilding = hostNodes.get().rebuilding();
- assertEquals(2, hostsRebuilding.size());
-
- // Hosts are rebuilt
+ assertEquals(1, hostsRebuilding.size());
completeRebuildOf(hostsRebuilding.asList(), NodeType.host);
+ assertEquals(1, hostNodes.get().onOsVersion(version1).size());
+
+ // Second host is rebuilt
+ versions.resumeUpgradeOf(NodeType.host, true);
+ completeRebuildOf(hostNodes.get().rebuilding().asList(), NodeType.host);
assertEquals(2, hostNodes.get().onOsVersion(version1).size());
// The remaining hosts complete their upgrade