aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Musum <musum@yahoo-inc.com>2017-09-29 18:35:32 +0200
committerGitHub <noreply@github.com>2017-09-29 18:35:32 +0200
commit61679bfc56d0f9c5df09e92b70375ec1262179f0 (patch)
tree7e083745a8e04c6621073fede79899552165dbf8
parent2d5eb7d0976ac0650ab1d2acfcca4984252f8349 (diff)
parentdfe49809cd3d23c9d7e059ddbf6aee4f763942ab (diff)
Merge pull request #3585 from vespa-engine/mpolden/retry-from-first-job
Retry from first job
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java13
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java63
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java56
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java6
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java17
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java80
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java66
7 files changed, 135 insertions, 166 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
index fd3fa867c99..ab1b5aa3cd4 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
@@ -127,9 +127,9 @@ public class ApplicationList {
return listOf(list.stream().sorted(Comparator.comparing(application -> application.deployedVersion().orElse(Version.emptyVersion))));
}
- /** Returns the subset of applications which currently do not have any job in progress for the given change */
- public ApplicationList notRunningJobFor(Change.VersionChange change) {
- return listOf(list.stream().filter(a -> !hasRunningJob(a, change)));
+ /** Returns the subset of applications that are not upgrading or started upgrading before the grace period */
+ public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant startOfGracePeriod) {
+ return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, startOfGracePeriod)));
}
// ----------------------------------- Internal helpers
@@ -157,11 +157,12 @@ public class ApplicationList {
return false;
}
- private static boolean hasRunningJob(Application application, Change.VersionChange change) {
+ private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant instant) {
return application.deploymentJobs().jobStatus().values().stream()
.filter(JobStatus::inProgress)
- .filter(jobStatus -> jobStatus.lastTriggered().isPresent())
- .map(jobStatus -> jobStatus.lastTriggered().get())
+ .filter(status -> status.lastTriggered().isPresent())
+ .map(status -> status.lastTriggered().get())
+ .filter(jobRun -> jobRun.at().isAfter(instant))
.anyMatch(jobRun -> jobRun.version().equals(change.version()));
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 46f11ef208c..268965850ba 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.ApplicationController;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.application.Change;
+import com.yahoo.vespa.hosted.controller.application.DeploymentJobs;
import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobError;
import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobReport;
import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType;
@@ -31,6 +32,7 @@ import java.util.logging.Logger;
* This class is multithread safe.
*
* @author bratseth
+ * @author mpolden
*/
public class DeploymentTrigger {
@@ -97,27 +99,30 @@ public class DeploymentTrigger {
/**
* Called periodically to cause triggering of jobs in the background
*/
- public void triggerFailing(ApplicationId applicationId, String cause) {
+ public void triggerFailing(ApplicationId applicationId, Duration timeout) {
try (Lock lock = applications().lock(applicationId)) {
Application application = applications().require(applicationId);
- if (shouldRetryFromBeginning(application)) {
- // failed for a long time: Discard existing change and restart from the component job
- application = application.withDeploying(Optional.empty());
- application = trigger(JobType.component, application, false, "Retrying failing deployment from beginning: " + cause, lock);
- applications().store(application, lock);
- } else {
- // retry the failed job (with backoff)
- for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { // retry the *first* failing job
- JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
- if (isFailing(jobStatus)) {
- if (shouldRetryNow(jobStatus)) {
- application = trigger(jobType, application, false, "Retrying failing job: " + cause, lock);
- applications().store(application, lock);
- }
- break;
+ if (!application.deploying().isPresent()) { // No ongoing change, no need to retry
+ return;
+ }
+ // Retry first failing job
+ for (JobType jobType : order.jobsFrom(application.deploymentSpec())) {
+ JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
+ if (isFailing(application.deploying().get(), jobStatus)) {
+ if (shouldRetryNow(jobStatus)) {
+ application = trigger(jobType, application, false, "Retrying failing job", lock);
+ applications().store(application, lock);
}
+ break;
}
}
+ // Retry dead job
+ Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), timeout);
+ if (firstDeadJob.isPresent()) {
+ application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job",
+ lock);
+ applications().store(application, lock);
+ }
}
}
@@ -184,24 +189,26 @@ public class DeploymentTrigger {
//--- End of methods which triggers deployment jobs ----------------------------
private ApplicationController applications() { return controller.applications(); }
-
- private boolean isFailing(JobStatus jobStatusOrNull) {
- return jobStatusOrNull != null && !jobStatusOrNull.isSuccess();
+
+ /** Returns whether a job is failing for the current change in the given application */
+ private boolean isFailing(Change change, JobStatus status) {
+ return status != null &&
+ !status.isSuccess() &&
+ status.lastCompletedFor(change);
}
private boolean isCapacityConstrained(JobType jobType) {
return jobType == JobType.stagingTest || jobType == JobType.systemTest;
}
- private boolean shouldRetryFromBeginning(Application application) {
- Instant eightHoursAgo = clock.instant().minus(Duration.ofHours(8));
- Instant failingSince = application.deploymentJobs().failingSince();
- if (failingSince != null && failingSince.isAfter(eightHoursAgo)) return false;
-
- JobStatus componentJobStatus = application.deploymentJobs().jobStatus().get(JobType.component);
- if (componentJobStatus == null) return true;
- if ( ! componentJobStatus.lastCompleted().isPresent() ) return true;
- return componentJobStatus.lastCompleted().get().at().isBefore(eightHoursAgo);
+ /** Returns the first job that has been running for more than the given timeout */
+ private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs, Duration timeout) {
+ Instant startOfGracePeriod = controller.clock().instant().minus(timeout);
+ Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream()
+ .filter(JobStatus::inProgress)
+ .sorted(Comparator.comparing(status -> status.lastTriggered().get().at()))
+ .findFirst();
+ return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(startOfGracePeriod));
}
/** Decide whether the job should be triggered by the periodic trigger */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
index 43c1755af34..c8831bd6a56 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
@@ -4,13 +4,9 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
-import com.yahoo.vespa.hosted.controller.application.JobStatus;
import java.time.Duration;
-import java.time.Instant;
-import java.util.Comparator;
import java.util.List;
-import java.util.Optional;
/**
* Attempts redeployment of failed jobs and deployments.
@@ -31,57 +27,11 @@ public class FailureRedeployer extends Maintainer {
List<Application> applications = ApplicationList.from(controller().applications().asList())
.notPullRequest()
.asList();
- retryFailingJobs(applications);
- retryStuckJobs(applications);
+ applications.forEach(application -> triggerFailing(application, jobTimeout));
}
- private void retryFailingJobs(List<Application> applications) {
- for (Application application : applications) {
- if (!application.deploying().isPresent()) {
- continue;
- }
- if (application.deploymentJobs().inProgress()) {
- continue;
- }
- Optional<JobStatus> failingJob = jobFailingFor(application);
- failingJob.ifPresent(job -> triggerFailing(application, "Job " + job.type().id() +
- " has been failing since " + job.firstFailing().get()));
- }
- }
-
- private void retryStuckJobs(List<Application> applications) {
- Instant startOfGracePeriod = controller().clock().instant().minus(jobTimeout);
- for (Application application : applications) {
- Optional<JobStatus> job = oldestRunningJob(application);
- if (!job.isPresent()) {
- continue;
- }
- // Ignore job if it doesn't belong to a zone in this system
- if (!job.get().type().zone(controller().system()).isPresent()) {
- continue;
- }
- if (job.get().lastTriggered().get().at().isBefore(startOfGracePeriod)) {
- triggerFailing(application, "Job " + job.get().type().id() +
- " has been running for more than " + jobTimeout);
- }
- }
- }
-
- private Optional<JobStatus> jobFailingFor(Application application) {
- return application.deploymentJobs().jobStatus().values().stream()
- .filter(status -> !status.isSuccess() && status.lastCompletedFor(application.deploying().get()))
- .findFirst();
- }
-
- private Optional<JobStatus> oldestRunningJob(Application application) {
- return application.deploymentJobs().jobStatus().values().stream()
- .filter(JobStatus::inProgress)
- .sorted(Comparator.comparing(status -> status.lastTriggered().get().at()))
- .findFirst();
- }
-
- private void triggerFailing(Application application, String cause) {
- controller().applications().deploymentTrigger().triggerFailing(application.id(), cause);
+ private void triggerFailing(Application application, Duration timeout) {
+ controller().applications().deploymentTrigger().triggerFailing(application.id(), timeout);
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index 54ef5df8ac0..fd742ab983a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -11,6 +11,7 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
+import java.time.Instant;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -21,6 +22,8 @@ import java.util.logging.Logger;
*/
public class Upgrader extends Maintainer {
+ private static final Duration upgradeTimeout = Duration.ofHours(12);
+
private static final Logger log = Logger.getLogger(Upgrader.class.getName());
public Upgrader(Controller controller, Duration interval, JobControl jobControl) {
@@ -62,12 +65,13 @@ public class Upgrader extends Maintainer {
private void upgrade(ApplicationList applications, Version version) {
Change.VersionChange change = new Change.VersionChange(version);
+ Instant startOfUpgradePeriod = controller().clock().instant().minus(upgradeTimeout);
cancelUpgradesOf(applications.upgradingToLowerThan(version));
applications = applications.notPullRequest(); // Pull requests are deployed as separate applications to test then deleted; No need to upgrade
applications = applications.onLowerVersionThan(version);
applications = applications.notDeployingApplication(); // wait with applications deploying an application change
applications = applications.notFailingOn(version); // try to upgrade only if it hasn't failed on this version
- applications = applications.notRunningJobFor(change); // do not trigger multiple jobs simultaneously for same upgrade
+ applications = applications.notCurrentlyUpgrading(change, startOfUpgradePeriod); // do not trigger again if currently upgrading
applications = applications.canUpgradeAt(controller().clock().instant()); // wait with applications that are currently blocking upgrades
for (Application application : applications.byIncreasingDeployedVersion().asList()) {
try {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index 0c0cc0485c8..5a48bc54b49 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -47,22 +47,29 @@ public class DeploymentTriggerTest {
tester.updateVersionStatus(version);
tester.upgrader().maintain();
+ // system-test fails and is retried
tester.deployAndNotify(app, applicationPackage, false, JobType.systemTest);
assertEquals("Retried immediately", 1, tester.buildSystem().jobs().size());
-
tester.buildSystem().takeJobsToRun();
assertEquals("Job removed", 0, tester.buildSystem().jobs().size());
- tester.clock().advance(Duration.ofHours(2));
+ tester.clock().advance(Duration.ofHours(4).plus(Duration.ofSeconds(1)));
tester.failureRedeployer().maintain();
+
assertEquals("Retried job", 1, tester.buildSystem().jobs().size());
assertEquals(JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName());
-
tester.buildSystem().takeJobsToRun();
assertEquals("Job removed", 0, tester.buildSystem().jobs().size());
+
+ // system-test succeeds and staging-test starts
+ tester.failureRedeployer().maintain();
+ tester.deployAndNotify(app, applicationPackage, true, JobType.systemTest);
+
+ // staging-test times out and is retried
+ tester.buildSystem().takeJobsToRun();
tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
tester.failureRedeployer().maintain();
- assertEquals("Retried from the beginning", 1, tester.buildSystem().jobs().size());
- assertEquals(JobType.component.id(), tester.buildSystem().jobs().get(0).jobName());
+ assertEquals("Retried dead job", 1, tester.buildSystem().jobs().size());
+ assertEquals(JobType.stagingTest.id(), tester.buildSystem().jobs().get(0).jobName());
}
@Test
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java
index 2d1b7463dea..286db864c22 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java
@@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentJobs;
import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester;
import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer;
-import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import org.junit.Test;
import java.nio.file.Files;
@@ -116,81 +115,16 @@ public class FailureRedeployerTest {
tester.failureRedeployer().maintain();
assertTrue("No jobs retried", tester.buildSystem().jobs().isEmpty());
- // Just over 12 hours pass, deployment is retried from beginning
+ // Just over 12 hours pass, job is retried
tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
tester.failureRedeployer().maintain();
- assertEquals(DeploymentJobs.JobType.component.id(), tester.buildSystem().takeJobsToRun().get(0).jobName());
-
- // Ensure that system-test is triggered after component. Triggering component records a new change, but in this
- // case there's already a change in progress which we want to discard and start over
- tester.notifyJobCompletion(DeploymentJobs.JobType.component, app, true);
- assertEquals(DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName());
- }
-
- @Test
- public void testAlwaysRestartsDeploymentOfApplicationsWithStuckJobs() {
- DeploymentTester tester = new DeploymentTester();
- Version version = Version.fromString("5.0");
- tester.updateVersionStatus(version);
-
- ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
- .environment(Environment.prod)
- .region("us-west-1")
- .build();
-
- // Setup applications
- Application canary0 = tester.createAndDeploy("canary0", 0, "canary");
- Application canary1 = tester.createAndDeploy("canary1", 1, "canary");
- Application default0 = tester.createAndDeploy("default0", 2, "default");
- Application default1 = tester.createAndDeploy("default1", 3, "default");
- Application default2 = tester.createAndDeploy("default2", 4, "default");
- Application default3 = tester.createAndDeploy("default3", 5, "default");
- Application default4 = tester.createAndDeploy("default4", 6, "default");
-
- // New version is released
- version = Version.fromString("5.1");
- tester.updateVersionStatus(version);
- assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber());
- tester.upgrader().maintain();
-
- // Canaries upgrade and raise confidence
- tester.completeUpgrade(canary0, version, "canary");
- tester.completeUpgrade(canary1, version, "canary");
- tester.updateVersionStatus(version);
- assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
-
- // Applications with default policy start upgrading
- tester.clock().advance(Duration.ofMinutes(1));
- tester.upgrader().maintain();
- assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size());
-
- // 4/5 applications fail, confidence is lowered and upgrade is cancelled
- tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest);
- tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest);
- tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest);
- tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest);
- tester.updateVersionStatus(version);
- assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence());
- tester.upgrader().maintain();
-
- // 5th app never reports back and has a dead locked job, but no ongoing change
- Application deadLocked = tester.applications().require(default4.id());
- assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress());
- assertFalse("No change present", deadLocked.deploying().isPresent());
-
- // 4/5 applications are repaired and confidence is restored
- tester.deployCompletely(default0, applicationPackage);
- tester.deployCompletely(default1, applicationPackage);
- tester.deployCompletely(default2, applicationPackage);
- tester.deployCompletely(default3, applicationPackage);
- tester.updateVersionStatus(version);
- assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+ assertEquals(DeploymentJobs.JobType.stagingTest.id(), tester.buildSystem().takeJobsToRun().get(0).jobName());
- // Over 12 hours pass and failure redeployer restarts deployment of 5th app
- tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
- tester.failureRedeployer().maintain();
- assertEquals("Deployment is restarted", DeploymentJobs.JobType.component.id(),
- tester.buildSystem().jobs().get(0).jobName());
+ // Deployment completes
+ tester.deploy(DeploymentJobs.JobType.stagingTest, app, applicationPackage, true);
+ tester.notifyJobCompletion(DeploymentJobs.JobType.stagingTest, app, true);
+ tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3);
+ assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty());
}
@Test
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
index 6709098ba06..5be030b4fd9 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
@@ -400,4 +400,70 @@ public class UpgraderTest {
assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty());
}
+ @Test
+ public void testReschedulesUpgradeAfterTimeout() {
+ DeploymentTester tester = new DeploymentTester();
+ Version version = Version.fromString("5.0");
+ tester.updateVersionStatus(version);
+
+ ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+ .environment(Environment.prod)
+ .region("us-west-1")
+ .build();
+
+ // Setup applications
+ Application canary0 = tester.createAndDeploy("canary0", 0, "canary");
+ Application canary1 = tester.createAndDeploy("canary1", 1, "canary");
+ Application default0 = tester.createAndDeploy("default0", 2, "default");
+ Application default1 = tester.createAndDeploy("default1", 3, "default");
+ Application default2 = tester.createAndDeploy("default2", 4, "default");
+ Application default3 = tester.createAndDeploy("default3", 5, "default");
+ Application default4 = tester.createAndDeploy("default4", 6, "default");
+
+ // New version is released
+ version = Version.fromString("5.1");
+ tester.updateVersionStatus(version);
+ assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber());
+ tester.upgrader().maintain();
+
+ // Canaries upgrade and raise confidence
+ tester.completeUpgrade(canary0, version, "canary");
+ tester.completeUpgrade(canary1, version, "canary");
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+
+ // Applications with default policy start upgrading
+ tester.clock().advance(Duration.ofMinutes(1));
+ tester.upgrader().maintain();
+ assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size());
+
+ // 4/5 applications fail, confidence is lowered and upgrade is cancelled
+ tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence());
+ tester.upgrader().maintain();
+
+ // 5th app never reports back and has a dead locked job, but no ongoing change
+ Application deadLocked = tester.applications().require(default4.id());
+ assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress());
+ assertFalse("No change present", deadLocked.deploying().isPresent());
+
+ // 4/5 applications are repaired and confidence is restored
+ tester.deployCompletely(default0, applicationPackage);
+ tester.deployCompletely(default1, applicationPackage);
+ tester.deployCompletely(default2, applicationPackage);
+ tester.deployCompletely(default3, applicationPackage);
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+
+ // Over 12 hours pass and upgrade is rescheduled for 5th app
+ tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
+ tester.upgrader().maintain();
+ assertEquals("Upgrade is rescheduled", DeploymentJobs.JobType.systemTest.id(),
+ tester.buildSystem().jobs().get(0).jobName());
+ }
+
}