diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2022-01-05 10:50:42 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-05 10:50:42 +0100 |
commit | cfaa0be19da00466a73b57a9630477eabe46a8bd (patch) | |
tree | ae162f69ebf3a80dac151fb875454f5e295b727b /controller-server | |
parent | 0188f4acf60f33b09940c35c6e1c4d435ceb2891 (diff) | |
parent | 4794cba6b41db063433e35e0d95e5bc879b06be7 (diff) |
Merge pull request #20654 from vespa-engine/mpolden/skip-rescheduling
Do not reschedule upgrade for instances without production steps
Diffstat (limited to 'controller-server')
2 files changed, 60 insertions, 52 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java index 0e8a59134ed..85762cf530a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java @@ -1,7 +1,6 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.application; -import com.google.common.collect.ImmutableMap; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentSpec; @@ -14,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatusList; import java.time.Instant; import java.util.Collection; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -31,16 +31,7 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL private InstanceList(Collection<? extends ApplicationId> items, boolean negate, Map<ApplicationId, DeploymentStatus> instances) { super(items, negate, (i, n) -> new InstanceList(i, n, instances)); - this.instances = instances; - } - - public static InstanceList from(DeploymentStatusList statuses) { - ImmutableMap.Builder<ApplicationId, DeploymentStatus> builder = ImmutableMap.builder(); - for (DeploymentStatus status : statuses.asList()) - for (InstanceName instance : status.application().deploymentSpec().instanceNames()) - builder.put(status.application().id().instance(instance), status); - Map<ApplicationId, DeploymentStatus> map = builder.build(); - return new InstanceList(map.keySet(), false, map); + this.instances = Map.copyOf(instances); } /** @@ -62,7 +53,7 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL .map(readyAt -> ! readyAt.isAfter(instant)).orElse(false)); } - /** Returns the subset of instances which have at least one productiog deployment */ + /** Returns the subset of instances which have at least one production deployment */ public InstanceList withProductionDeployment() { return matching(id -> instance(id).productionDeployments().size() > 0); } @@ -80,9 +71,9 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL .anyMatch(deployment -> deployment.version().isBefore(version))); } - /** Returns the subset of instances which have changes left to deploy; blocked, or deploying */ - public InstanceList withChanges() { - return matching(id -> instance(id).change().hasTargets() || instances.get(id).outstandingChange(id.instance()).hasTargets()); + /** Returns the subset of instances that has completed deployment of given change */ + public InstanceList hasCompleted(Change change) { + return matching(id -> instances.get(id).jobsToRun(Map.of(id.instance(), change)).isEmpty()); } /** Returns the subset of instances which are currently deploying a change */ @@ -153,4 +144,12 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL return application(id).require(id.instance()); } + public static InstanceList from(DeploymentStatusList statuses) { + Map<ApplicationId, DeploymentStatus> instances = new HashMap<>(); + for (DeploymentStatus status : statuses.asList()) + for (InstanceName instance : status.application().deploymentSpec().instanceNames()) + instances.put(status.application().id().instance(instance), status); + return new InstanceList(instances.keySet(), false, instances); + } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 7a7eee7183b..9af838b1f55 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -58,39 +58,8 @@ public class Upgrader extends ControllerMaintainer { Collection<Version> defaultTargets = targetVersions(Confidence.normal, versionStatus); Collection<Version> conservativeTargets = targetVersions(Confidence.high, versionStatus); - // Cancel upgrades to broken targets (let other ongoing upgrades complete to avoid starvation) - for (VespaVersion version : versionStatus.versions()) { - if (version.confidence() == Confidence.broken) - cancelUpgradesOf(instances().upgradingTo(version.versionNumber()) - .not().with(UpgradePolicy.canary), - version.versionNumber() + " is broken"); - } - - // Canaries should always try the canary target - cancelUpgradesOf(instances().upgrading() - .not().upgradingTo(canaryTarget) - .with(UpgradePolicy.canary), - "Outdated target version for Canaries"); - - // Cancel *failed* upgrades to earlier versions, as the new version may fix it - String reason = "Failing on outdated version"; - cancelUpgradesOf(instances().upgrading() - .failing() - .not().upgradingTo(defaultTargets) - .with(UpgradePolicy.defaultPolicy), - reason); - cancelUpgradesOf(instances().upgrading() - .failing() - .not().upgradingTo(conservativeTargets) - .with(UpgradePolicy.conservative), - reason); - - // Schedule the right upgrades - InstanceList instances = instances(); - Optional<Integer> targetMajorVersion = targetMajorVersion(); - upgrade(instances.with(UpgradePolicy.canary), canaryTarget, targetMajorVersion, instances.size()); - defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, targetMajorVersion, numberOfApplicationsToUpgrade())); - conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, targetMajorVersion, numberOfApplicationsToUpgrade())); + cancelUpgrades(versionStatus, canaryTarget, defaultTargets, conservativeTargets); + upgrade(versionStatus, canaryTarget, defaultTargets, conservativeTargets); return 1.0; } @@ -108,22 +77,62 @@ public class Upgrader extends ControllerMaintainer { } /** Returns a list of all production application instances, except those which are pinned, which we should not manipulate here. */ - private InstanceList instances() { - return InstanceList.from(controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()))) + private InstanceList instances(Version systemVersion) { + return InstanceList.from(controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()), systemVersion)) .withDeclaredJobs() .unpinned(); } + private void cancelUpgrades(VersionStatus versionStatus, Version canaryTarget, Collection<Version> defaultTargets, Collection<Version> conservativeTargets) { + InstanceList instances = instances(controller().systemVersion(versionStatus)); + // Cancel upgrades to broken targets (let other ongoing upgrades complete to avoid starvation) + for (VespaVersion version : versionStatus.versions()) { + if (version.confidence() == Confidence.broken) + cancelUpgradesOf(instances.upgradingTo(version.versionNumber()) + .not().with(UpgradePolicy.canary), + version.versionNumber() + " is broken"); + } + + // Canaries should always try the canary target + cancelUpgradesOf(instances.upgrading() + .not().upgradingTo(canaryTarget) + .with(UpgradePolicy.canary), + "Outdated target version for Canaries"); + + // Cancel *failed* upgrades to earlier versions, as the new version may fix it + String reason = "Failing on outdated version"; + cancelUpgradesOf(instances.upgrading() + .failing() + .not().upgradingTo(defaultTargets) + .with(UpgradePolicy.defaultPolicy), + reason); + cancelUpgradesOf(instances.upgrading() + .failing() + .not().upgradingTo(conservativeTargets) + .with(UpgradePolicy.conservative), + reason); + } + + private void upgrade(VersionStatus versionStatus, Version canaryTarget, Collection<Version> defaultTargets, Collection<Version> conservativeTargets) { + InstanceList instances = instances(controller().systemVersion(versionStatus)); + Optional<Integer> targetMajorVersion = targetMajorVersion(); + upgrade(instances.with(UpgradePolicy.canary), canaryTarget, targetMajorVersion, instances.size()); + defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, targetMajorVersion, numberOfApplicationsToUpgrade())); + conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, targetMajorVersion, numberOfApplicationsToUpgrade())); + } + private void upgrade(InstanceList instances, Version version, Optional<Integer> targetMajorVersion, int numberToUpgrade) { + Change change = Change.of(version); instances.not().failingOn(version) .allowMajorVersion(version.getMajor(), targetMajorVersion.orElse(version.getMajor())) .not().deploying() + .not().hasCompleted(change) // Avoid rescheduling change for instances without production steps .onLowerVersionThan(version) .canUpgradeAt(version, controller().clock().instant()) .shuffle(random) // Shuffle so we do not always upgrade instances in the same order .byIncreasingDeployedVersion() - .first(numberToUpgrade).asList() - .forEach(instance -> controller().applications().deploymentTrigger().triggerChange(instance, Change.of(version))); + .first(numberToUpgrade) + .forEach(instance -> controller().applications().deploymentTrigger().triggerChange(instance, change)); } private void cancelUpgradesOf(InstanceList instances, String reason) { |