From c7206bbdc2cac293258d6e701dc15f3cb699873e Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 12 Apr 2021 21:16:22 +0200 Subject: Keep wantToRebuild flag until ready --- .../com/yahoo/vespa/hosted/provision/NodeList.java | 3 --- .../yahoo/vespa/hosted/provision/node/Nodes.java | 25 +++++++++++----------- .../vespa/hosted/provision/NodeRepositoryTest.java | 5 +++++ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index f647130651e..522027654a1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -9,9 +9,6 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; import java.util.Comparator; import java.util.EnumSet; import java.util.List; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 3e4221ecfcd..41c7c1f5f1c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -17,7 +17,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer; import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter; -import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter; import com.yahoo.vespa.hosted.provision.node.filter.StateFilter; import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; import com.yahoo.vespa.hosted.provision.restapi.NotFoundException; @@ -181,10 +180,13 @@ public class Nodes { .map(node -> { if (node.state() != Node.State.provisioned && node.state() != Node.State.dirty) illegal("Can not set " + node + " ready. It is not provisioned or dirty."); - return node.withWantToRetire(false, false, Agent.system, clock.instant()); + return node.withWantToRetire(false, + false, + false, + Agent.system, + clock.instant()); }) .collect(Collectors.toList()); - return db.writeTo(Node.State.ready, nodesWithResetFields, agent, Optional.of(reason)); } } @@ -201,15 +203,14 @@ public class Nodes { public Node restore(String hostname, Agent agent, String reason) { // A deprovisioned host has no children so this doesn't need to to be recursive try (NodeMutex lock = lockAndGetRequired(hostname)) { - Node existing = lock.node(); - if (existing.state() != Node.State.deprovisioned) illegal("Can not move node " + hostname + " to " + - Node.State.provisioned + ". It is not in " + - Node.State.deprovisioned); - if (!existing.status().wantToRebuild()) illegal("Can not move node " + hostname + " to " + - Node.State.provisioned + - ". Rebuild has not been requested"); - Node nodeWithResetFields = existing.withWantToRetire(false, false, false, agent, clock.instant()); - return db.writeTo(Node.State.provisioned, nodeWithResetFields, agent, Optional.of(reason)); + Node node = lock.node(); + if (node.state() != Node.State.deprovisioned) illegal("Can not move node " + hostname + " to " + + Node.State.provisioned + ". It is not in " + + Node.State.deprovisioned); + if (!node.status().wantToRebuild()) illegal("Can not move node " + hostname + " to " + + Node.State.provisioned + + ". Rebuild has not been requested"); + return db.writeTo(Node.State.provisioned, node, agent, Optional.of(reason)); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index f9e7da9b563..1ec462b0620 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -246,6 +246,11 @@ public class NodeRepositoryTest { Node node = tester.nodeRepository().nodes().restore(host2, Agent.system, getClass().getSimpleName()); assertSame(Node.State.provisioned, node.state()); assertEquals("IP addresses are preserved", ipConfigOfHost2, node.ipConfig()); + assertTrue(node.status().wantToRetire()); + assertTrue(node.status().wantToRebuild()); + + // Readying host clears rebuild flag + node = tester.nodeRepository().nodes().setReady(host2, Agent.system, getClass().getSimpleName()); assertFalse(node.status().wantToRetire()); assertFalse(node.status().wantToRebuild()); } -- cgit v1.2.3 From 024b79b2ff4cd92f63f7dcb8e19f6d96a95ef8a9 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 12 Apr 2021 21:20:48 +0200 Subject: Extract constant --- .../com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java | 3 ++- .../main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java index af17934a878..84454e0d06a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java @@ -31,7 +31,8 @@ public class DelegatingOsUpgrader implements OsUpgrader { public DelegatingOsUpgrader(NodeRepository nodeRepository, int maxActiveUpgrades) { this.nodeRepository = Objects.requireNonNull(nodeRepository); this.maxActiveUpgrades = maxActiveUpgrades; - if (maxActiveUpgrades < 1) throw new IllegalArgumentException("maxActiveUpgrades must be positive"); + if (maxActiveUpgrades < 1) throw new IllegalArgumentException("maxActiveUpgrades must be positive, was " + + maxActiveUpgrades); } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java index 613738458c2..1366c323f1e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java @@ -30,13 +30,16 @@ public class OsVersions { private static final Logger log = Logger.getLogger(OsVersions.class.getName()); + /** The maximum number of concurrent upgrades triggered by {@link DelegatingOsUpgrader} */ + private static final int MAX_DELEGATED_UPGRADES = 30; + private final NodeRepository nodeRepository; private final CuratorDatabaseClient db; private final boolean reprovisionToUpgradeOs; private final int maxDelegatedUpgrades; public OsVersions(NodeRepository nodeRepository) { - this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), 30); + this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), MAX_DELEGATED_UPGRADES); } OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades) { -- cgit v1.2.3 From 76e70c163870c34217da1625688f41be56e98a86 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 12 Apr 2021 21:22:48 +0200 Subject: Fix typo --- .../yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java index 7a29414642e..856d534bbd2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java @@ -12,7 +12,7 @@ import java.time.Duration; import java.util.List; /** - * This moves nodes of type {@link NodeType#host} from preovisioned to parked if they have been in provisioned too long. + * This moves nodes of type {@link NodeType#host} from provisioned to parked if they have been in provisioned too long. * * Only {@link NodeType#host} is moved because any number of nodes of that type can exist. Other node types such as * {@link NodeType#confighost} have a fixed number and thus cannot be replaced while the fixed number of nodes exist in -- cgit v1.2.3 From 58db69342f40af1af39542e6e939048bef71b320 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 12 Apr 2021 21:27:42 +0200 Subject: Limit concurrent host rebuilds --- .../vespa/hosted/provision/os/OsVersions.java | 11 ++++-- .../hosted/provision/os/RebuildingOsUpgrader.java | 26 +++++++++++---- .../hosted/provision/os/RetiringOsUpgrader.java | 39 ++++++++++++---------- .../vespa/hosted/provision/os/OsVersionsTest.java | 19 ++++++----- 4 files changed, 60 insertions(+), 35 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java index 1366c323f1e..d3e09fbed2f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java @@ -33,20 +33,25 @@ public class OsVersions { /** The maximum number of concurrent upgrades triggered by {@link DelegatingOsUpgrader} */ private static final int MAX_DELEGATED_UPGRADES = 30; + /** The maximum number of concurrent upgrades (rebuilds) triggered by {@link RebuildingOsUpgrader} */ + private static final int MAX_REBUILDS = 3; + private final NodeRepository nodeRepository; private final CuratorDatabaseClient db; private final boolean reprovisionToUpgradeOs; private final int maxDelegatedUpgrades; + private final int maxRebuilds; public OsVersions(NodeRepository nodeRepository) { - this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), MAX_DELEGATED_UPGRADES); + this(nodeRepository, nodeRepository.zone().getCloud().reprovisionToUpgradeOs(), MAX_DELEGATED_UPGRADES, MAX_REBUILDS); } - OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades) { + OsVersions(NodeRepository nodeRepository, boolean reprovisionToUpgradeOs, int maxDelegatedUpgrades, int maxRebuilds) { this.nodeRepository = Objects.requireNonNull(nodeRepository); this.db = nodeRepository.database(); this.reprovisionToUpgradeOs = reprovisionToUpgradeOs; this.maxDelegatedUpgrades = maxDelegatedUpgrades; + this.maxRebuilds = maxRebuilds; // Read and write all versions to make sure they are stored in the latest version of the serialized format try (var lock = db.lockOsVersionChange()) { @@ -137,7 +142,7 @@ public class OsVersions { .anyMatch(osVersion -> osVersion.current().isPresent() && osVersion.current().get().getMajor() < target.getMajor()); if (rebuildRequired) { - return new RebuildingOsUpgrader(nodeRepository); + return new RebuildingOsUpgrader(nodeRepository, maxRebuilds); } return new DelegatingOsUpgrader(nodeRepository, maxDelegatedUpgrades); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java index 0e10e9f44de..77d0f88eb98 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java @@ -24,16 +24,28 @@ public class RebuildingOsUpgrader extends RetiringOsUpgrader { private static final Logger LOG = Logger.getLogger(RebuildingOsUpgrader.class.getName()); - public RebuildingOsUpgrader(NodeRepository nodeRepository) { + private final int maxRebuilds; + + public RebuildingOsUpgrader(NodeRepository nodeRepository, int maxRebuilds) { super(nodeRepository); + this.maxRebuilds = maxRebuilds; + if (maxRebuilds < 1) throw new IllegalArgumentException("maxRebuilds must be positive, was " + maxRebuilds); + } + + @Override + protected NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) { + if (allNodes.rebuilding().size() < maxRebuilds) { + return super.candidates(instant, target, allNodes); + } + return NodeList.of(); } - protected void upgradeNodes(NodeList activeNodes, Version version, Instant instant) { - activeNodes.osVersionIsBefore(version) - .not().rebuilding() - .byIncreasingOsVersion() - .first(1) - .forEach(node -> rebuild(node, version, instant)); + @Override + protected void upgradeNodes(NodeList candidates, Version version, Instant instant) { + candidates.not().rebuilding() + .byIncreasingOsVersion() + .first(1) + .forEach(node -> rebuild(node, version, instant)); } private void rebuild(Node host, Version target, Instant now) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java index 61d9c6b6b5d..cee52cb2177 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java @@ -33,31 +33,36 @@ public class RetiringOsUpgrader implements OsUpgrader { } @Override - public void upgradeTo(OsVersionTarget target) { + public final void upgradeTo(OsVersionTarget target) { NodeList allNodes = nodeRepository.nodes().list(); - NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType()); - if (activeNodes.isEmpty()) return; // No nodes eligible for upgrade - Instant now = nodeRepository.clock().instant(); - Duration nodeBudget = target.upgradeBudget() - .dividedBy(activeNodes.size()); - Instant retiredAt = target.lastRetiredAt().orElse(Instant.EPOCH); - if (now.isBefore(retiredAt.plus(nodeBudget))) return; // Budget has not been spent yet - - upgradeNodes(activeNodes, target.version(), now); + NodeList candidates = candidates(now, target, allNodes); + upgradeNodes(candidates, target.version(), now); } @Override - public void disableUpgrade(NodeType type) { + public final void disableUpgrade(NodeType type) { // No action needed in this implementation. } - protected void upgradeNodes(NodeList activeNodes, Version version, Instant instant) { - activeNodes.osVersionIsBefore(version) - .not().deprovisioning() - .byIncreasingOsVersion() - .first(1) - .forEach(node -> deprovision(node, version, instant)); + /** Returns nodes that are candidates for upgrade */ + protected NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) { + NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType()); + if (activeNodes.isEmpty()) return NodeList.of(); + + Duration nodeBudget = target.upgradeBudget().dividedBy(activeNodes.size()); + Instant retiredAt = target.lastRetiredAt().orElse(Instant.EPOCH); + if (instant.isBefore(retiredAt.plus(nodeBudget))) return NodeList.of(); // Budget has not been spent yet + + return activeNodes.osVersionIsBefore(target.version()); + } + + /** Trigger upgrade of candidates to given version */ + protected void upgradeNodes(NodeList candidates, Version version, Instant instant) { + candidates.not().deprovisioning() + .byIncreasingOsVersion() + .first(1) + .forEach(node -> deprovision(node, version, instant)); } /** Upgrade given host by retiring and deprovisioning it */ diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java index 8c9e52e80f5..f3e5778cb60 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java @@ -91,7 +91,7 @@ public class OsVersionsTest { public void max_active_upgrades() { int totalNodes = 20; int maxActiveUpgrades = 5; - var versions = new OsVersions(tester.nodeRepository(), false, maxActiveUpgrades); + var versions = new OsVersions(tester.nodeRepository(), false, maxActiveUpgrades, Integer.MAX_VALUE); provisionInfraApplication(totalNodes); Supplier hostNodes = () -> tester.nodeRepository().nodes().list().state(Node.State.active).hosts(); @@ -156,7 +156,7 @@ public class OsVersionsTest { @Test public void upgrade_by_retiring() { - var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE); + var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE, Integer.MAX_VALUE); var clock = (ManualClock) tester.nodeRepository().clock(); int hostCount = 10; // Provision hosts and children @@ -221,7 +221,7 @@ public class OsVersionsTest { @Test public void upgrade_by_retiring_everything_at_once() { - var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE); + var versions = new OsVersions(tester.nodeRepository(), true, Integer.MAX_VALUE, Integer.MAX_VALUE); int hostCount = 3; provisionInfraApplication(hostCount, NodeType.confighost); Supplier hostNodes = () -> tester.nodeRepository().nodes().list() @@ -244,7 +244,7 @@ public class OsVersionsTest { @Test public void upgrade_by_rebuilding() { - var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE); + var versions = new OsVersions(tester.nodeRepository(), false, Integer.MAX_VALUE, 1); var clock = tester.clock(); int hostCount = 10; provisionInfraApplication(hostCount + 1); @@ -273,14 +273,17 @@ public class OsVersionsTest { versions.resumeUpgradeOf(NodeType.host, true); assertEquals(1, hostNodes.get().rebuilding().size()); - // Budget has been spent and another host is rebuilt + // Time budget has been spent, but we cannot rebuild another host until the current one is done clock.advance(nodeBudget); versions.resumeUpgradeOf(NodeType.host, true); NodeList hostsRebuilding = hostNodes.get().rebuilding(); - assertEquals(2, hostsRebuilding.size()); - - // Hosts are rebuilt + assertEquals(1, hostsRebuilding.size()); completeRebuildOf(hostsRebuilding.asList(), NodeType.host); + assertEquals(1, hostNodes.get().onOsVersion(version1).size()); + + // Second host is rebuilt + versions.resumeUpgradeOf(NodeType.host, true); + completeRebuildOf(hostNodes.get().rebuilding().asList(), NodeType.host); assertEquals(2, hostNodes.get().onOsVersion(version1).size()); // The remaining hosts complete their upgrade -- cgit v1.2.3