diff options
5 files changed, 100 insertions, 18 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java index 22df5ca559e..4a00a272c75 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; +import com.yahoo.vespa.hosted.controller.application.Change; import java.time.Instant; import java.util.Collection; @@ -36,8 +37,10 @@ public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus /** Returns the subset of applications which have been failing an application change since the given instant */ public DeploymentStatusList failingApplicationChangeSince(Instant threshold) { - return matching(status -> status.instanceJobs().values().stream() - .anyMatch(jobs -> failingApplicationChangeSince(jobs, threshold))); + return matching(status -> status.instanceJobs().entrySet().stream() + .anyMatch(jobs -> failingApplicationChangeSince(jobs.getValue(), + status.application().require(jobs.getKey().instance()).change(), + threshold))); } private static boolean failingUpgradeToVersionSince(JobList jobs, Version version, Instant threshold) { @@ -47,10 +50,8 @@ public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus .isEmpty(); } - private static boolean failingApplicationChangeSince(JobList jobs, Instant threshold) { - return ! jobs.failingApplicationChange() - .firstFailing().endedNoLaterThan(threshold) - .isEmpty(); + private static boolean failingApplicationChangeSince(JobList jobs, Change change, Instant threshold) { + return change.revision().map(revision -> ! jobs.failingWithBrokenRevisionSince(revision, threshold).isEmpty()).orElse(false); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index c28f94bc4d7..7ceeda08d3a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -61,6 +61,7 @@ import static java.util.stream.Collectors.toMap; public class DeploymentTrigger { public static final Duration maxPause = Duration.ofDays(3); + public static final Duration maxFailingRevisionTime = Duration.ofDays(5); private final static Logger log = Logger.getLogger(DeploymentTrigger.class.getName()); private final Controller controller; @@ -448,6 +449,8 @@ public class DeploymentTrigger { private boolean acceptNewRevision(DeploymentStatus status, InstanceName instance, RevisionId revision) { if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance. + if ( ! status.jobs().failingWithBrokenRevisionSince(revision, clock.instant().minus(maxFailingRevisionTime)) + .isEmpty()) return false; // Don't deploy a broken revision. boolean isChangingRevision = status.application().require(instance).change().revision().isPresent(); DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance); Predicate<RevisionId> revisionFilter = spec.revisionTarget() == DeploymentSpec.RevisionTarget.next diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index 551f841233e..3074c9ac3ba 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; import com.yahoo.config.provision.InstanceName; -import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId; @@ -74,6 +73,14 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { return matching(JobList::failingApplicationChange); } + /** Returns the subset of jobs which are failing because of an application change, and have been since the threshold, on the given revision. */ + public JobList failingWithBrokenRevisionSince(RevisionId broken, Instant threshold) { + return failingApplicationChange().matching(job -> job.runs().values().stream() + .anyMatch(run -> run.versions().targetRevision().equals(broken) + && run.hasFailed() + && run.start().isBefore(threshold))); + } + /** Returns the subset of jobs which are failing with the given run status. */ public JobList withStatus(RunStatus status) { return matching(job -> job.lastStatus().map(status::equals).orElse(false)); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index d654f63fff2..a2fb0df626f 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -5,17 +5,20 @@ import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentSpec.UpgradePolicy; import com.yahoo.config.provision.ApplicationId; import com.yahoo.transaction.Mutex; -import com.yahoo.vespa.curator.Lock; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.InstanceList; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatusList; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; import com.yahoo.vespa.hosted.controller.versions.VersionStatus; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -23,6 +26,7 @@ import java.util.Map; import java.util.Optional; import java.util.OptionalInt; import java.util.Random; +import java.util.Set; import java.util.function.UnaryOperator; import java.util.logging.Level; import java.util.logging.Logger; @@ -58,18 +62,22 @@ public class Upgrader extends ControllerMaintainer { cancelBrokenUpgrades(versionStatus); OptionalInt targetMajorVersion = targetMajorVersion(); - InstanceList instances = instances(versionStatus); + DeploymentStatusList deploymentStatuses = deploymentStatuses(versionStatus); for (UpgradePolicy policy : UpgradePolicy.values()) - updateTargets(versionStatus, instances, policy, targetMajorVersion); + updateTargets(versionStatus, deploymentStatuses, policy, targetMajorVersion); return 1.0; } + private DeploymentStatusList deploymentStatuses(VersionStatus versionStatus) { + return controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()) + .withProjectId(), + versionStatus); + } + /** Returns a list of all production application instances, except those which are pinned, which we should not manipulate here. */ - private InstanceList instances(VersionStatus versionStatus) { - return InstanceList.from(controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()) - .withProjectId(), - versionStatus)) + private InstanceList instances(DeploymentStatusList deploymentStatuses) { + return InstanceList.from(deploymentStatuses) .withDeclaredJobs() .shuffle(random) .byIncreasingDeployedVersion() @@ -78,7 +86,7 @@ public class Upgrader extends ControllerMaintainer { private void cancelBrokenUpgrades(VersionStatus versionStatus) { // Cancel upgrades to broken targets (let other ongoing upgrades complete to avoid starvation) - InstanceList instances = instances(controller().readVersionStatus()); + InstanceList instances = instances(deploymentStatuses(controller().readVersionStatus())); for (VespaVersion version : versionStatus.versions()) { if (version.confidence() == Confidence.broken) cancelUpgradesOf(instances.upgradingTo(version.versionNumber()).not().with(UpgradePolicy.canary), @@ -86,8 +94,12 @@ public class Upgrader extends ControllerMaintainer { } } - private void updateTargets(VersionStatus versionStatus, InstanceList instances, UpgradePolicy policy, OptionalInt targetMajorVersion) { + private void updateTargets(VersionStatus versionStatus, DeploymentStatusList deploymentStatuses, UpgradePolicy policy, OptionalInt targetMajorVersion) { + InstanceList instances = instances(deploymentStatuses); InstanceList remaining = instances.with(policy); + Instant failureThreshold = controller().clock().instant().minus(DeploymentTrigger.maxFailingRevisionTime); + Set<ApplicationId> failingRevision = InstanceList.from(deploymentStatuses.failingApplicationChangeSince(failureThreshold)).asSet(); + List<Version> targetAndNewer = new ArrayList<>(); UnaryOperator<InstanceList> cancellationCriterion = policy == UpgradePolicy.canary ? i -> i.not().upgradingTo(targetAndNewer) : i -> i.failing() @@ -103,13 +115,16 @@ public class Upgrader extends ControllerMaintainer { // Prefer the newest target for each instance. remaining = remaining.not().matching(eligible.asList()::contains) .not().hasCompleted(Change.of(version)); - for (ApplicationId id : outdated.and(eligible.not().upgrading()).not().changingRevision()) + for (ApplicationId id : outdated.and(eligible.not().upgrading())) targets.put(id, version); } int numberToUpgrade = policy == UpgradePolicy.canary ? instances.size() : numberOfApplicationsToUpgrade(); for (ApplicationId id : instances.matching(targets.keySet()::contains).first(numberToUpgrade)) { log.log(Level.INFO, "Triggering upgrade to " + targets.get(id) + " for " + id); + if (failingRevision.contains(id)) + controller().applications().deploymentTrigger().cancelChange(id, ChangesToCancel.APPLICATION); + controller().applications().deploymentTrigger().triggerChange(id, Change.of(targets.get(id))); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 3b21d3a017e..5968b490c09 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentContext; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger; import com.yahoo.vespa.hosted.controller.deployment.Run; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; @@ -35,6 +36,7 @@ import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.pro import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.stagingTest; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.systemTest; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.ALL; +import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.APPLICATION; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PIN; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PLATFORM; import static org.junit.Assert.assertEquals; @@ -995,6 +997,60 @@ public class UpgraderTest { } @Test + public void testSettingFailingRevisionAside() { + DeploymentContext app = tester.newDeploymentContext().submit().deploy(); + + // New revision fails. + app.submit(); + Optional<RevisionId> revision1 = app.lastSubmission(); + app.failDeployment(systemTest); + + // New version is not targeted. + Version version1 = new Version("7"); + tester.controllerTester().upgradeSystem(version1); + assertEquals(Change.of(revision1.get()), app.instance().change()); + + tester.upgrader().run(); + assertEquals(Change.of(revision1.get()), app.instance().change()); + + // Broken revision is replaced by a new attempt, which also fails, and cancellation is not yet triggered. + tester.clock().advance(DeploymentTrigger.maxFailingRevisionTime.plusSeconds(1)); + app.submit(); + Optional<RevisionId> revision2 = app.lastSubmission(); + app.failDeployment(systemTest); + tester.upgrader().run(); + assertEquals(Change.of(revision2.get()), app.instance().change()); + + // Broken revision is cancelled, and new version targeted, after some time. + tester.clock().advance(DeploymentTrigger.maxFailingRevisionTime.plusSeconds(1)); + tester.upgrader().run(); + assertEquals(Change.of(version1), app.instance().change()); + + // Broken revision is not targeted again. + app.triggerJobs(); + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + assertEquals(Change.of(version1), app.instance().change()); + + app.failDeployment(systemTest); + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + assertEquals(Change.of(version1), app.instance().change()); + + // Revision gets a second change when upgrade fixes the failing job. + tester.clock().advance(Duration.ofDays(12)); // Time for retries. + app.runJob(systemTest).jobAborted(stagingTest).runJob(stagingTest).runJob(productionUsCentral1); + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + + assertEquals(Change.of(version1).with(revision2.get()), app.instance().change()); + app.runJob(systemTest).runJob(stagingTest).runJob(productionUsCentral1); // Revision rolls. + app.runJob(productionUsEast3).runJob(productionUsWest1); // Upgrade completes. + app.runJob(productionUsEast3).runJob(productionUsWest1); // Revision completes. + assertEquals(Change.empty(), app.instance().change()); + } + + @Test public void testsEachUpgradeCombinationWithFailingDeployments() { Version v1 = Version.fromString("6.1"); tester.controllerTester().upgradeSystem(v1); @@ -1085,7 +1141,7 @@ public class UpgraderTest { default2.instanceId(), default2); // Throttle upgrades per run - ((ManualClock) tester.controller().clock()).setInstant(Instant.ofEpochMilli(1589787109000L)); // Fixed random seed + ((ManualClock) tester.controller().clock()).setInstant(Instant.ofEpochMilli(1589787107000L)); // Fixed random seed Upgrader upgrader = new Upgrader(tester.controller(), Duration.ofMinutes(10)); upgrader.setUpgradesPerMinute(0.1); |