diff options
10 files changed, 133 insertions, 38 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java index 68c79fdca7e..b1ea4584497 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java @@ -128,8 +128,12 @@ public abstract class InfrastructureUpgrader<TARGET extends VersionTarget> exten /** Returns whether the upgrader should expect given node to upgrade */ protected abstract boolean expectUpgradeOf(Node node, SystemApplication application, ZoneApi zone); - /** Find the highest version used by nodes satisfying nodeSlice in zone. If no such slice exists, the lowest known version is returned */ - protected final Optional<Version> versionOf(NodeSlice nodeSlice, ZoneApi zone, SystemApplication application, Function<Node, Version> versionField) { + /** + * Find the version currently used by a slice of nodes, in given zone. If no such slice exists, + * the lowest (or highest, when downgrading) overall version is returned. + */ + protected final Optional<Version> versionOf(NodeSlice nodeSlice, ZoneApi zone, SystemApplication application, + Function<Node, Version> versionField, boolean downgrading) { try { Map<Version, Long> nodeCountByVersion = controller().serviceRegistry().configServer() .nodeRepository() @@ -147,11 +151,13 @@ public abstract class InfrastructureUpgrader<TARGET extends VersionTarget> exten } } if (!versionsOfMatchingSlices.isEmpty()) { - // Choose the highest version in case we have several matching slices - return versionsOfMatchingSlices.stream().max(Comparator.naturalOrder()); + return downgrading + ? versionsOfMatchingSlices.stream().min(Comparator.naturalOrder()) + : versionsOfMatchingSlices.stream().max(Comparator.naturalOrder()); } - // No matching slices found, fall back to the lowest known version - return nodeCountByVersion.keySet().stream().min(Comparator.naturalOrder()); + return downgrading + ? nodeCountByVersion.keySet().stream().max(Comparator.naturalOrder()) + : nodeCountByVersion.keySet().stream().min(Comparator.naturalOrder()); } catch (Exception e) { throw new UnreachableNodeRepositoryException(Text.format("Failed to get version for %s in %s: %s", application.id(), zone, diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java index 4df40850cc9..44f0bcecf5f 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgrader.java @@ -54,7 +54,7 @@ public class OsUpgrader extends InfrastructureUpgrader<OsVersionTarget> { @Override protected boolean convergedOn(OsVersionTarget target, SystemApplication application, ZoneApi zone, NodeSlice nodeSlice) { - Version currentVersion = versionOf(nodeSlice, zone, application, Node::currentOsVersion).orElse(target.osVersion().version()); + Version currentVersion = versionOf(nodeSlice, zone, application, Node::currentOsVersion, target.downgrade()).orElse(target.version()); return satisfiedBy(currentVersion, target); } @@ -67,9 +67,11 @@ public class OsUpgrader extends InfrastructureUpgrader<OsVersionTarget> { @Override protected Optional<OsVersionTarget> target() { - // Return target if we have nodes in this cloud on the wrong version + // Return target if we have nodes in this cloud on the wrong version, or if we're downgrading a zone which does + // not support downgrading all nodes return controller().os().target(cloud) - .filter(target -> controller().os().status().nodesIn(cloud).stream() + .filter(target -> (target.downgrade() && !downgradingSupported()) || + controller().os().status().nodesIn(cloud).stream() .anyMatch(node -> !satisfiedBy(node.currentVersion(), target))); } @@ -79,19 +81,23 @@ public class OsUpgrader extends InfrastructureUpgrader<OsVersionTarget> { return controller().serviceRegistry().configServer().nodeRepository() .targetVersionsOf(zone.getVirtualId()) .osVersion(application.nodeType()) - .map(currentVersion -> !satisfiedBy(currentVersion, target)) + .map(currentVersion -> !currentVersion.equals(target.version())) .orElse(true); } - private static boolean satisfiedBy(Version version, OsVersionTarget target) { - if (target.downgrade()) { - // When downgrading we want an exact version + private boolean satisfiedBy(Version version, OsVersionTarget target) { + if (target.downgrade() && downgradingSupported()) { + // When downgrading we want an exact version if the cloud supports downgrades return version.equals(target.osVersion().version()); } // Otherwise, matching or later version is fine return !version.isBefore(target.osVersion().version()); } + private boolean downgradingSupported() { + return !controller().zoneRegistry().zones().all().dynamicallyProvisioned().in(cloud).zones().isEmpty(); + } + /** Returns whether node currently allows upgrades */ public static boolean canUpgrade(Node node, boolean includeDeferring) { return (includeDeferring || !node.deferOsUpgrade()) && upgradableNodeStates.contains(node.state()); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java index 000acd16155..effcc4dd4df 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemUpgrader.java @@ -40,7 +40,7 @@ public class SystemUpgrader extends InfrastructureUpgrader<VespaVersionTarget> { @Override protected boolean convergedOn(VespaVersionTarget target, SystemApplication application, ZoneApi zone, NodeSlice nodeSlice) { - Optional<Version> currentVersion = versionOf(nodeSlice, zone, application, Node::currentVersion); + Optional<Version> currentVersion = versionOf(nodeSlice, zone, application, Node::currentVersion, target.downgrade()); // Skip application convergence check if there are no nodes belonging to the application in the zone if (currentVersion.isEmpty()) return true; @@ -78,7 +78,7 @@ public class SystemUpgrader extends InfrastructureUpgrader<VespaVersionTarget> { // For applications with package we do not have a zone-wide version target. This means that we must check // the wanted version of each node. boolean zoneHasSharedRouting = controller().zoneRegistry().routingMethod(zone.getId()).isShared(); - return versionOf(NodeSlice.ALL, zone, application, Node::wantedVersion) + return versionOf(NodeSlice.ALL, zone, application, Node::wantedVersion, target.downgrade()) .map(wantedVersion -> !wantedVersion.equals(target.version())) .orElse(zoneHasSharedRouting); // Always upgrade if zone uses shared routing, but has no nodes allocated yet } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java index 056db4f119c..fad78edc58f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgraderTest.java @@ -18,10 +18,14 @@ import com.yahoo.vespa.hosted.controller.versions.NodeVersion; import org.junit.jupiter.api.Test; import java.time.Duration; +import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.function.Function; import java.util.function.UnaryOperator; +import java.util.stream.Collectors; import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -198,7 +202,8 @@ public class OsUpgraderTest { statusUpdater.maintain(); // All zones upgrade - for (var zone : List.of(zone1, zone2)) { + List<ZoneApi> zones = new ArrayList<>(List.of(zone1, zone2)); + for (var zone : zones) { osUpgrader.maintain(); completeUpgrade(version1, SystemApplication.tenantHost, zone); statusUpdater.maintain(); @@ -208,9 +213,17 @@ public class OsUpgraderTest { // Downgrade is triggered tester.controller().os().upgradeTo(version0, cloud, true, false); + // Zone order is reversed + Collections.reverse(zones); - // All zones downgrade, in reverse order - for (var zone : List.of(zone2, zone1)) { + // One host in first zone downgrades. Wanted version is not changed for second zone yet + osUpgrader.maintain(); + completeUpgrade(1, version0, SystemApplication.tenantHost, zones.get(0)); + osUpgrader.maintain(); + assertWanted(version1, SystemApplication.tenantHost, zones.get(1)); + + // All zones downgrade + for (var zone : zones) { osUpgrader.maintain(); completeUpgrade(version0, SystemApplication.tenantHost, zone); statusUpdater.maintain(); @@ -219,6 +232,55 @@ public class OsUpgraderTest { .allMatch(node -> node.currentVersion().equals(version0)), "All nodes on target version"); } + @Test + public void downgrade_os_partially() { + CloudName cloud = CloudName.from("cloud"); + ZoneApi zone1 = zone("dev.us-east-1", cloud); + ZoneApi zone2 = zone("prod.us-west-1", cloud); + UpgradePolicy upgradePolicy = UpgradePolicy.builder() + .upgrade(zone1) + .upgrade(zone2) + .build(); + OsUpgrader osUpgrader = osUpgrader(upgradePolicy, cloud, false); + + // Bootstrap system + tester.configServer().bootstrap(List.of(zone1.getId(), zone2.getId()), + List.of(SystemApplication.tenantHost)); + + // New OS version released + Version version0 = Version.fromString("1.0"); + Version version1 = Version.fromString("2.0"); + tester.controller().os().upgradeTo(version1, cloud, false, false); + statusUpdater.maintain(); + + // All zones upgrade + for (var zone : List.of(zone1, zone2)) { + osUpgrader.maintain(); + completeUpgrade(version1, SystemApplication.tenantHost, zone); + statusUpdater.maintain(); + } + assertTrue(tester.controller().os().status().nodesIn(cloud).stream() + .allMatch(node -> node.currentVersion().equals(version1)), "All nodes on target version"); + + // Downgrade is triggered + tester.controller().os().upgradeTo(version0, cloud, true, false); + + // All zones downgrade, in reverse order + for (var zone : List.of(zone2, zone1)) { + osUpgrader.maintain(); + // Partial downgrading happens, as this decision is left up to the zone. Downgrade target is still set in + // all zones as a best-effort, and to halt any further upgrades + completeUpgrade(1, version0, SystemApplication.tenantHost, zone); + statusUpdater.maintain(); + } + int zoneCount = 2; + Map<Version, Long> currentVersions = tester.controller().os().status().nodesIn(cloud).stream() + .collect(Collectors.groupingBy(NodeVersion::currentVersion, + Collectors.counting())); + assertEquals(1 * zoneCount, currentVersions.get(version0)); + assertEquals(2 * zoneCount, currentVersions.get(version1)); + } + private List<NodeVersion> nodesOn(Version version) { return tester.controller().os().status().versions().entrySet().stream() .filter(entry -> entry.getKey().version().equals(version)) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java index 2e0e780182b..ffb9477c88e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/OsVersion.java @@ -43,6 +43,11 @@ public class OsVersion { return wanted.isPresent() && !current.equals(wanted); } + /** Returns whether this node is downgrading its version */ + public boolean downgrading() { + return (wanted.isPresent() && current.isPresent()) && wanted.get().isBefore(current.get()); + } + /** Returns whether this is before the given version */ public boolean isBefore(Version version) { return current.isEmpty() || current.get().isBefore(version); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java index 23aa03a5315..4ee0774db8f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java @@ -12,8 +12,8 @@ import java.util.Optional; import java.util.logging.Logger; /** - * An upgrader that delegates the upgrade to the node itself, triggered by changing its wanted OS version. This - * implementation limits the number of parallel upgrades to avoid overloading the orchestrator with suspension requests. + * An upgrader that delegates the upgrade to the node itself, triggered by changing its wanted OS version. Downgrades + * are not supported. * * Used in clouds where nodes can upgrade themselves in-place, without data loss. * @@ -32,6 +32,8 @@ public class DelegatingOsUpgrader extends OsUpgrader { NodeList activeNodes = nodeRepository.nodes().list(Node.State.active).nodeType(target.nodeType()); Instant now = nodeRepository.clock().instant(); NodeList nodesToUpgrade = activeNodes.not().changingOsVersionTo(target.version()) + // This upgrader cannot downgrade nodes. We therefore consider only nodes + // on a lower version than the target .osVersionIsBefore(target.version()) .matching(node -> canUpgradeAt(now, node)) .byIncreasingOsVersion() diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java index 5def863113c..f8becd31792 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java @@ -43,9 +43,10 @@ public abstract class OsUpgrader { return Math.max(0, max - upgrading); } - /** Returns whether node can upgrade at given instant */ + /** Returns whether node can change version at given instant */ final boolean canUpgradeAt(Instant instant, Node node) { - return node.history().age(instant).compareTo(gracePeriod()) > 0; + return node.status().osVersion().downgrading() || // Fast-track downgrades + node.history().age(instant).compareTo(gracePeriod()) > 0; } /** The duration this leaves new nodes alone before scheduling any upgrade */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java index e0affaae666..805793b41a4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java @@ -66,13 +66,13 @@ public class RebuildingOsUpgrader extends OsUpgrader { .statefulClusters()); // Rebuild hosts not containing stateful clusters with retiring nodes, up to rebuild limit - NodeList activeHosts = hostsOfTargetType.state(Node.State.active); - int rebuildLimit = upgradeSlots(target, activeHosts.rebuilding(softRebuild)); + NodeList hosts = hostsOfTargetType.state(Node.State.active, Node.State.provisioned); + int rebuildLimit = upgradeSlots(target, hosts.rebuilding(softRebuild)); List<Node> hostsToRebuild = new ArrayList<>(rebuildLimit); - NodeList candidates = activeHosts.not().rebuilding(softRebuild) - .osVersionIsBefore(target.version()) - .matching(node -> canUpgradeAt(now, node)) - .byIncreasingOsVersion(); + NodeList candidates = hosts.not().rebuilding(softRebuild) + .not().onOsVersion(target.version()) + .matching(node -> canUpgradeAt(now, node)) + .byIncreasingOsVersion(); for (Node host : candidates) { if (hostsToRebuild.size() == rebuildLimit) break; Set<ClusterId> clustersOnHost = activeNodes.childrenOf(host).statefulClusters(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java index de4915d60aa..ccb7f40b0de 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java @@ -47,16 +47,16 @@ public class RetiringOsUpgrader extends OsUpgrader { /** Returns nodes that are candidates for upgrade */ private NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) { - NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType()); + NodeList nodes = allNodes.state(Node.State.active, Node.State.provisioned).nodeType(target.nodeType()); if (softRebuild) { // Retire only hosts which do not have a replaceable root disk - activeNodes = activeNodes.not().replaceableRootDisk(); + nodes = nodes.not().replaceableRootDisk(); } - return activeNodes.not().deprovisioning() - .osVersionIsBefore(target.version()) - .matching(node -> canUpgradeAt(instant, node)) - .byIncreasingOsVersion() - .first(upgradeSlots(target, activeNodes.deprovisioning())); + return nodes.not().deprovisioning() + .not().onOsVersion(target.version()) + .matching(node -> canUpgradeAt(instant, node)) + .byIncreasingOsVersion() + .first(upgradeSlots(target, nodes.deprovisioning())); } /** Upgrade given host by retiring and deprovisioning it */ diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java index 0fa9aa610a4..fda4c47b1ee 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java @@ -38,7 +38,6 @@ import static org.junit.Assert.fail; public class OsVersionsTest { private final ProvisioningTester tester = new ProvisioningTester.Builder().build(); - private final ApplicationId infraApplication = ApplicationId.from("hosted-vespa", "infra", "default"); @Test public void upgrade() { @@ -159,7 +158,7 @@ public class OsVersionsTest { } @Test - public void upgrade_by_retiring() { + public void upgrade_and_downgrade_by_retiring() { int maxActiveUpgrades = 2; var versions = new OsVersions(tester.nodeRepository(), Cloud.builder().dynamicProvisioning(true).build()); setMaxActiveUpgrades(maxActiveUpgrades); @@ -196,7 +195,7 @@ public class OsVersionsTest { assertEquals(2, deprovisioningChildrenOf(nodesDeprovisioning.asList().get(0)).size()); completeReprovisionOf(nodesDeprovisioning.asList()); - // Remaining hosts complete upgrades one by one + // Remaining hosts upgrade, batch by batch for (int i = 0; i < hostCount - 2; i += maxActiveUpgrades) { versions.resumeUpgradeOf(NodeType.host, true); nodesDeprovisioning = hostNodes.get().deprovisioning(); @@ -212,6 +211,20 @@ public class OsVersionsTest { // Resuming after everything has upgraded does nothing versions.resumeUpgradeOf(NodeType.host, true); assertEquals(0, hostNodes.get().deprovisioning().size()); + + // Downgrade is triggered + var version0 = Version.fromString("7.0"); + versions.setTarget(NodeType.host, version0, true); + + // Hosts downgrade, batch by batch + for (int i = 0; i < hostCount; i += maxActiveUpgrades) { + versions.resumeUpgradeOf(NodeType.host, true); + nodesDeprovisioning = hostNodes.get().deprovisioning(); + assertEquals(maxActiveUpgrades, nodesDeprovisioning.size()); + completeReprovisionOf(nodesDeprovisioning.asList()); + } + assertEquals(hostCount, hostNodes.get().onOsVersion(version0).not().deprovisioning().size()); + assertEquals(hostCount*2, tester.nodeRepository().nodes().list(Node.State.deprovisioned).size()); } @Test |