aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2023-07-20 15:12:09 +0200
committerGitHub <noreply@github.com>2023-07-20 15:12:09 +0200
commit69d40ce3c2b9ee06b9dab98d581c07f9fe9b2973 (patch)
tree3d6bd32fec9990b59e999a8ca6a7e64c09c73a8c
parentafe1a7626a75c2d1af6b147979c45536fe7bd14c (diff)
parent052b83e68665f94c0bccbaf169e625d747b989fa (diff)
Merge pull request #27847 from vespa-engine/mpolden/os-downgrade-zone
OS downgrades at zone level
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java18
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java20
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java4
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java68
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java5
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java6
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java5
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java12
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java14
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java19
10 files changed, 133 insertions, 38 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java
index 68c79fdca7e..b1ea4584497 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java
@@ -128,8 +128,12 @@ public abstract class InfrastructureUpgrader<TARGET extends VersionTarget> exten
/** Returns whether the upgrader should expect given node to upgrade */
protected abstract boolean expectUpgradeOf(Node node, SystemApplication application, ZoneApi zone);
- /** Find the highest version used by nodes satisfying nodeSlice in zone. If no such slice exists, the lowest known version is returned */
- protected final Optional<Version> versionOf(NodeSlice nodeSlice, ZoneApi zone, SystemApplication application, Function<Node, Version> versionField) {
+ /**
+ * Find the version currently used by a slice of nodes, in given zone. If no such slice exists,
+ * the lowest (or highest, when downgrading) overall version is returned.
+ */
+ protected final Optional<Version> versionOf(NodeSlice nodeSlice, ZoneApi zone, SystemApplication application,
+ Function<Node, Version> versionField, boolean downgrading) {
try {
Map<Version, Long> nodeCountByVersion = controller().serviceRegistry().configServer()
.nodeRepository()
@@ -147,11 +151,13 @@ public abstract class InfrastructureUpgrader<TARGET extends VersionTarget> exten
}
}
if (!versionsOfMatchingSlices.isEmpty()) {
- // Choose the highest version in case we have several matching slices
- return versionsOfMatchingSlices.stream().max(Comparator.naturalOrder());
+ return downgrading
+ ? versionsOfMatchingSlices.stream().min(Comparator.naturalOrder())
+ : versionsOfMatchingSlices.stream().max(Comparator.naturalOrder());
}
- // No matching slices found, fall back to the lowest known version
- return nodeCountByVersion.keySet().stream().min(Comparator.naturalOrder());
+ return downgrading
+ ? nodeCountByVersion.keySet().stream().max(Comparator.naturalOrder())
+ : nodeCountByVersion.keySet().stream().min(Comparator.naturalOrder());
} catch (Exception e) {
throw new UnreachableNodeRepositoryException(Text.format("Failed to get version for %s in %s: %s",
application.id(), zone,
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java
index 4df40850cc9..44f0bcecf5f 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java
@@ -54,7 +54,7 @@ public class OsUpgrader extends InfrastructureUpgrader<OsVersionTarget> {
@Override
protected boolean convergedOn(OsVersionTarget target, SystemApplication application, ZoneApi zone, NodeSlice nodeSlice) {
- Version currentVersion = versionOf(nodeSlice, zone, application, Node::currentOsVersion).orElse(target.osVersion().version());
+ Version currentVersion = versionOf(nodeSlice, zone, application, Node::currentOsVersion, target.downgrade()).orElse(target.version());
return satisfiedBy(currentVersion, target);
}
@@ -67,9 +67,11 @@ public class OsUpgrader extends InfrastructureUpgrader<OsVersionTarget> {
@Override
protected Optional<OsVersionTarget> target() {
- // Return target if we have nodes in this cloud on the wrong version
+ // Return target if we have nodes in this cloud on the wrong version, or if we're downgrading a zone which does
+ // not support downgrading all nodes
return controller().os().target(cloud)
- .filter(target -> controller().os().status().nodesIn(cloud).stream()
+ .filter(target -> (target.downgrade() && !downgradingSupported()) ||
+ controller().os().status().nodesIn(cloud).stream()
.anyMatch(node -> !satisfiedBy(node.currentVersion(), target)));
}
@@ -79,19 +81,23 @@ public class OsUpgrader extends InfrastructureUpgrader<OsVersionTarget> {
return controller().serviceRegistry().configServer().nodeRepository()
.targetVersionsOf(zone.getVirtualId())
.osVersion(application.nodeType())
- .map(currentVersion -> !satisfiedBy(currentVersion, target))
+ .map(currentVersion -> !currentVersion.equals(target.version()))
.orElse(true);
}
- private static boolean satisfiedBy(Version version, OsVersionTarget target) {
- if (target.downgrade()) {
- // When downgrading we want an exact version
+ private boolean satisfiedBy(Version version, OsVersionTarget target) {
+ if (target.downgrade() && downgradingSupported()) {
+ // When downgrading we want an exact version if the cloud supports downgrades
return version.equals(target.osVersion().version());
}
// Otherwise, matching or later version is fine
return !version.isBefore(target.osVersion().version());
}
+ private boolean downgradingSupported() {
+ return !controller().zoneRegistry().zones().all().dynamicallyProvisioned().in(cloud).zones().isEmpty();
+ }
+
/** Returns whether node currently allows upgrades */
public static boolean canUpgrade(Node node, boolean includeDeferring) {
return (includeDeferring || !node.deferOsUpgrade()) && upgradableNodeStates.contains(node.state());
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java
index 000acd16155..effcc4dd4df 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java
@@ -40,7 +40,7 @@ public class SystemUpgrader extends InfrastructureUpgrader<VespaVersionTarget> {
@Override
protected boolean convergedOn(VespaVersionTarget target, SystemApplication application, ZoneApi zone, NodeSlice nodeSlice) {
- Optional<Version> currentVersion = versionOf(nodeSlice, zone, application, Node::currentVersion);
+ Optional<Version> currentVersion = versionOf(nodeSlice, zone, application, Node::currentVersion, target.downgrade());
// Skip application convergence check if there are no nodes belonging to the application in the zone
if (currentVersion.isEmpty()) return true;
@@ -78,7 +78,7 @@ public class SystemUpgrader extends InfrastructureUpgrader<VespaVersionTarget> {
// For applications with package we do not have a zone-wide version target. This means that we must check
// the wanted version of each node.
boolean zoneHasSharedRouting = controller().zoneRegistry().routingMethod(zone.getId()).isShared();
- return versionOf(NodeSlice.ALL, zone, application, Node::wantedVersion)
+ return versionOf(NodeSlice.ALL, zone, application, Node::wantedVersion, target.downgrade())
.map(wantedVersion -> !wantedVersion.equals(target.version()))
.orElse(zoneHasSharedRouting); // Always upgrade if zone uses shared routing, but has no nodes allocated yet
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java
index 056db4f119c..fad78edc58f 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java
@@ -18,10 +18,14 @@ import com.yahoo.vespa.hosted.controller.versions.NodeVersion;
import org.junit.jupiter.api.Test;
import java.time.Duration;
+import java.util.ArrayList;
import java.util.Collection;
+import java.util.Collections;
import java.util.List;
+import java.util.Map;
import java.util.function.Function;
import java.util.function.UnaryOperator;
+import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -198,7 +202,8 @@ public class OsUpgraderTest {
statusUpdater.maintain();
// All zones upgrade
- for (var zone : List.of(zone1, zone2)) {
+ List<ZoneApi> zones = new ArrayList<>(List.of(zone1, zone2));
+ for (var zone : zones) {
osUpgrader.maintain();
completeUpgrade(version1, SystemApplication.tenantHost, zone);
statusUpdater.maintain();
@@ -208,9 +213,17 @@ public class OsUpgraderTest {
// Downgrade is triggered
tester.controller().os().upgradeTo(version0, cloud, true, false);
+ // Zone order is reversed
+ Collections.reverse(zones);
- // All zones downgrade, in reverse order
- for (var zone : List.of(zone2, zone1)) {
+ // One host in first zone downgrades. Wanted version is not changed for second zone yet
+ osUpgrader.maintain();
+ completeUpgrade(1, version0, SystemApplication.tenantHost, zones.get(0));
+ osUpgrader.maintain();
+ assertWanted(version1, SystemApplication.tenantHost, zones.get(1));
+
+ // All zones downgrade
+ for (var zone : zones) {
osUpgrader.maintain();
completeUpgrade(version0, SystemApplication.tenantHost, zone);
statusUpdater.maintain();
@@ -219,6 +232,55 @@ public class OsUpgraderTest {
.allMatch(node -> node.currentVersion().equals(version0)), "All nodes on target version");
}
+ @Test
+ public void downgrade_os_partially() {
+ CloudName cloud = CloudName.from("cloud");
+ ZoneApi zone1 = zone("dev.us-east-1", cloud);
+ ZoneApi zone2 = zone("prod.us-west-1", cloud);
+ UpgradePolicy upgradePolicy = UpgradePolicy.builder()
+ .upgrade(zone1)
+ .upgrade(zone2)
+ .build();
+ OsUpgrader osUpgrader = osUpgrader(upgradePolicy, cloud, false);
+
+ // Bootstrap system
+ tester.configServer().bootstrap(List.of(zone1.getId(), zone2.getId()),
+ List.of(SystemApplication.tenantHost));
+
+ // New OS version released
+ Version version0 = Version.fromString("1.0");
+ Version version1 = Version.fromString("2.0");
+ tester.controller().os().upgradeTo(version1, cloud, false, false);
+ statusUpdater.maintain();
+
+ // All zones upgrade
+ for (var zone : List.of(zone1, zone2)) {
+ osUpgrader.maintain();
+ completeUpgrade(version1, SystemApplication.tenantHost, zone);
+ statusUpdater.maintain();
+ }
+ assertTrue(tester.controller().os().status().nodesIn(cloud).stream()
+ .allMatch(node -> node.currentVersion().equals(version1)), "All nodes on target version");
+
+ // Downgrade is triggered
+ tester.controller().os().upgradeTo(version0, cloud, true, false);
+
+ // All zones downgrade, in reverse order
+ for (var zone : List.of(zone2, zone1)) {
+ osUpgrader.maintain();
+ // Partial downgrading happens, as this decision is left up to the zone. Downgrade target is still set in
+ // all zones as a best-effort, and to halt any further upgrades
+ completeUpgrade(1, version0, SystemApplication.tenantHost, zone);
+ statusUpdater.maintain();
+ }
+ int zoneCount = 2;
+ Map<Version, Long> currentVersions = tester.controller().os().status().nodesIn(cloud).stream()
+ .collect(Collectors.groupingBy(NodeVersion::currentVersion,
+ Collectors.counting()));
+ assertEquals(1 * zoneCount, currentVersions.get(version0));
+ assertEquals(2 * zoneCount, currentVersions.get(version1));
+ }
+
private List<NodeVersion> nodesOn(Version version) {
return tester.controller().os().status().versions().entrySet().stream()
.filter(entry -> entry.getKey().version().equals(version))
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java
index 2e0e780182b..ffb9477c88e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java
@@ -43,6 +43,11 @@ public class OsVersion {
return wanted.isPresent() && !current.equals(wanted);
}
+ /** Returns whether this node is downgrading its version */
+ public boolean downgrading() {
+ return (wanted.isPresent() && current.isPresent()) && wanted.get().isBefore(current.get());
+ }
+
/** Returns whether this is before the given version */
public boolean isBefore(Version version) {
return current.isEmpty() || current.get().isBefore(version);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java
index 23aa03a5315..4ee0774db8f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java
@@ -12,8 +12,8 @@ import java.util.Optional;
import java.util.logging.Logger;
/**
- * An upgrader that delegates the upgrade to the node itself, triggered by changing its wanted OS version. This
- * implementation limits the number of parallel upgrades to avoid overloading the orchestrator with suspension requests.
+ * An upgrader that delegates the upgrade to the node itself, triggered by changing its wanted OS version. Downgrades
+ * are not supported.
*
* Used in clouds where nodes can upgrade themselves in-place, without data loss.
*
@@ -32,6 +32,8 @@ public class DelegatingOsUpgrader extends OsUpgrader {
NodeList activeNodes = nodeRepository.nodes().list(Node.State.active).nodeType(target.nodeType());
Instant now = nodeRepository.clock().instant();
NodeList nodesToUpgrade = activeNodes.not().changingOsVersionTo(target.version())
+ // This upgrader cannot downgrade nodes. We therefore consider only nodes
+ // on a lower version than the target
.osVersionIsBefore(target.version())
.matching(node -> canUpgradeAt(now, node))
.byIncreasingOsVersion()
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java
index 5def863113c..f8becd31792 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java
@@ -43,9 +43,10 @@ public abstract class OsUpgrader {
return Math.max(0, max - upgrading);
}
- /** Returns whether node can upgrade at given instant */
+ /** Returns whether node can change version at given instant */
final boolean canUpgradeAt(Instant instant, Node node) {
- return node.history().age(instant).compareTo(gracePeriod()) > 0;
+ return node.status().osVersion().downgrading() || // Fast-track downgrades
+ node.history().age(instant).compareTo(gracePeriod()) > 0;
}
/** The duration this leaves new nodes alone before scheduling any upgrade */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
index e0affaae666..805793b41a4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java
@@ -66,13 +66,13 @@ public class RebuildingOsUpgrader extends OsUpgrader {
.statefulClusters());
// Rebuild hosts not containing stateful clusters with retiring nodes, up to rebuild limit
- NodeList activeHosts = hostsOfTargetType.state(Node.State.active);
- int rebuildLimit = upgradeSlots(target, activeHosts.rebuilding(softRebuild));
+ NodeList hosts = hostsOfTargetType.state(Node.State.active, Node.State.provisioned);
+ int rebuildLimit = upgradeSlots(target, hosts.rebuilding(softRebuild));
List<Node> hostsToRebuild = new ArrayList<>(rebuildLimit);
- NodeList candidates = activeHosts.not().rebuilding(softRebuild)
- .osVersionIsBefore(target.version())
- .matching(node -> canUpgradeAt(now, node))
- .byIncreasingOsVersion();
+ NodeList candidates = hosts.not().rebuilding(softRebuild)
+ .not().onOsVersion(target.version())
+ .matching(node -> canUpgradeAt(now, node))
+ .byIncreasingOsVersion();
for (Node host : candidates) {
if (hostsToRebuild.size() == rebuildLimit) break;
Set<ClusterId> clustersOnHost = activeNodes.childrenOf(host).statefulClusters();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
index de4915d60aa..ccb7f40b0de 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java
@@ -47,16 +47,16 @@ public class RetiringOsUpgrader extends OsUpgrader {
/** Returns nodes that are candidates for upgrade */
private NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) {
- NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType());
+ NodeList nodes = allNodes.state(Node.State.active, Node.State.provisioned).nodeType(target.nodeType());
if (softRebuild) {
// Retire only hosts which do not have a replaceable root disk
- activeNodes = activeNodes.not().replaceableRootDisk();
+ nodes = nodes.not().replaceableRootDisk();
}
- return activeNodes.not().deprovisioning()
- .osVersionIsBefore(target.version())
- .matching(node -> canUpgradeAt(instant, node))
- .byIncreasingOsVersion()
- .first(upgradeSlots(target, activeNodes.deprovisioning()));
+ return nodes.not().deprovisioning()
+ .not().onOsVersion(target.version())
+ .matching(node -> canUpgradeAt(instant, node))
+ .byIncreasingOsVersion()
+ .first(upgradeSlots(target, nodes.deprovisioning()));
}
/** Upgrade given host by retiring and deprovisioning it */
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
index 0fa9aa610a4..fda4c47b1ee 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
@@ -38,7 +38,6 @@ import static org.junit.Assert.fail;
public class OsVersionsTest {
private final ProvisioningTester tester = new ProvisioningTester.Builder().build();
- private final ApplicationId infraApplication = ApplicationId.from("hosted-vespa", "infra", "default");
@Test
public void upgrade() {
@@ -159,7 +158,7 @@ public class OsVersionsTest {
}
@Test
- public void upgrade_by_retiring() {
+ public void upgrade_and_downgrade_by_retiring() {
int maxActiveUpgrades = 2;
var versions = new OsVersions(tester.nodeRepository(), Cloud.builder().dynamicProvisioning(true).build());
setMaxActiveUpgrades(maxActiveUpgrades);
@@ -196,7 +195,7 @@ public class OsVersionsTest {
assertEquals(2, deprovisioningChildrenOf(nodesDeprovisioning.asList().get(0)).size());
completeReprovisionOf(nodesDeprovisioning.asList());
- // Remaining hosts complete upgrades one by one
+ // Remaining hosts upgrade, batch by batch
for (int i = 0; i < hostCount - 2; i += maxActiveUpgrades) {
versions.resumeUpgradeOf(NodeType.host, true);
nodesDeprovisioning = hostNodes.get().deprovisioning();
@@ -212,6 +211,20 @@ public class OsVersionsTest {
// Resuming after everything has upgraded does nothing
versions.resumeUpgradeOf(NodeType.host, true);
assertEquals(0, hostNodes.get().deprovisioning().size());
+
+ // Downgrade is triggered
+ var version0 = Version.fromString("7.0");
+ versions.setTarget(NodeType.host, version0, true);
+
+ // Hosts downgrade, batch by batch
+ for (int i = 0; i < hostCount; i += maxActiveUpgrades) {
+ versions.resumeUpgradeOf(NodeType.host, true);
+ nodesDeprovisioning = hostNodes.get().deprovisioning();
+ assertEquals(maxActiveUpgrades, nodesDeprovisioning.size());
+ completeReprovisionOf(nodesDeprovisioning.asList());
+ }
+ assertEquals(hostCount, hostNodes.get().onOsVersion(version0).not().deprovisioning().size());
+ assertEquals(hostCount*2, tester.nodeRepository().nodes().list(Node.State.deprovisioned).size());
}
@Test