diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2018-03-27 14:21:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-03-27 14:21:30 +0200 |
commit | 37702d7e5c74f1da1f8a181a7406f5e0a5f9415f (patch) | |
tree | 10362aa59c4c461c3e78ecada0d733099edfc378 | |
parent | 945db42c14146695825e7236bc40205cef8c27cd (diff) | |
parent | 0b3b1301e75c5b56fb354b7b054a764660608f82 (diff) |
Merge pull request #5430 from vespa-engine/jvenstad/DO-unified
Jvenstad/do unified
14 files changed, 188 insertions, 218 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java index 5823dd160c0..1d3fff57a78 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java @@ -553,6 +553,11 @@ public class ApplicationController { } public void notifyJobCompletion(JobReport report) { + log.log(Level.INFO, String.format("Notified of %s of %s %d for '%s'.", + report.jobError().map(error -> error + " failure").orElse("success"), + report.jobType(), + report.buildNumber(), + report.applicationId())); if ( ! get(report.applicationId()).isPresent()) { log.log(Level.WARNING, "Ignoring completion of job of project '" + report.projectId() + "': Unknown application '" + report.applicationId() + "'"); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 767ffbaa7ea..f6f65df56b7 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -87,17 +87,12 @@ public class DeploymentTrigger { // Handle successful starting and ending if (report.jobType() == JobType.component) { if (report.success()) { - if ( ! acceptNewApplicationVersionNow(application)) { - applications().store(application.withOutstandingChange(Change.of(applicationVersion))); - return; - } - // Note that in case of an ongoing upgrade this may result in both the upgrade and application - // change being deployed together - application = application.withChange(application.change().with(applicationVersion)); - } - else { // don't re-trigger component on failure - applications().store(application); - return; + if ( ! acceptNewApplicationVersionNow(application)) + application = application.withOutstandingChange(Change.of(applicationVersion)); + else + // Note that in case of an ongoing upgrade this may result in both the upgrade and application + // change being deployed together + application = application.withChange(application.change().with(applicationVersion)); } } else if (report.jobType().isProduction() && deploymentComplete(application)) { @@ -106,21 +101,6 @@ public class DeploymentTrigger { application = application.withChange(Change.empty()); } - // TODO jvenstad: Don't trigger. - // Trigger next - if (report.success()) { - triggerReadyJobs(application); - return; // Don't overwrite below. - } - else if (retryBecauseOutOfCapacity(application, report.jobType())) { - triggerReadyJobs(application); - return; // Don't overwrite below. - } - else if (retryBecauseNewFailure(application, report.jobType())) { - triggerReadyJobs(application); - return; // Don't overwrite below. - } - applications().store(application); }); } @@ -135,45 +115,6 @@ public class DeploymentTrigger { applications().lockIfPresent(application.id(), this::triggerReadyJobs); } - /** Find the next step to trigger if any, and triggers it */ - public void triggerReadyJobs(LockedApplication application) { - List<JobType> jobs = order.jobsFrom(application.deploymentSpec()); - - // Should the first step be triggered? - if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) { - JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); - if (application.change().platform().isPresent()) { - Version target = application.change().platform().get(); - if (systemTestStatus == null - || ! systemTestStatus.lastTriggered().isPresent() - || ! systemTestStatus.isSuccess() - || ! systemTestStatus.lastTriggered().get().version().equals(target) - || systemTestStatus.isHanging(jobTimeoutLimit())) { - application = trigger(new Triggering(application, JobType.systemTest, false, "Upgrade to " + target), Collections.emptySet(), false); - applications().store(application); - } - } - } - - // Find next steps to trigger based on the state of the previous step - for (JobType jobType : (Iterable<JobType>) Stream.concat(Stream.of(JobType.component), jobs.stream())::iterator) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (jobStatus == null) continue; // job has never run - - // Collect the subset of next jobs which have not run with the last changes - // TODO jvenstad: Change to be step-centric. - List<JobType> nextJobs = order.nextAfter(jobType, application); - for (JobType nextJobType : nextJobs) { - JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType); - if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit())) { - boolean isRetry = nextStatus != null && nextStatus.jobError().filter(JobError.outOfCapacity::equals).isPresent(); - application = trigger(new Triggering(application, nextJobType, isRetry, isRetry ? "Retrying on out of capacity" : "Available change in " + jobType.jobName()), nextJobs, false); - } - } - applications().store(application); - } - } - /** * Trigger a job for an application, if allowed * @@ -226,8 +167,7 @@ public class DeploymentTrigger { application = application.withChange(change); if (change.application().isPresent()) application = application.withOutstandingChange(Change.empty()); - // TODO jvenstad: Don't trigger. - application = trigger(new Triggering(application, JobType.systemTest, false, change.toString()), Collections.emptySet(), false); + applications().store(application); }); } @@ -250,22 +190,47 @@ public class DeploymentTrigger { //--- End of methods which triggers deployment jobs ---------------------------- - private ApplicationController applications() { return controller.applications(); } + /** Find the next step to trigger if any, and triggers it */ + private void triggerReadyJobs(LockedApplication application) { + List<JobType> jobs = order.jobsFrom(application.deploymentSpec()); - /** Retry immediately only if this job just started failing. Otherwise retry periodically */ - private boolean retryBecauseNewFailure(Application application, JobType jobType) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - return (jobStatus != null && jobStatus.firstFailing().get().at().isAfter(clock.instant().minus(Duration.ofSeconds(10)))); - } + // Should the first step be triggered? + if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) { + JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); + if (application.change().platform().isPresent()) { + Version target = application.change().platform().get(); + if (systemTestStatus == null + || ! systemTestStatus.lastTriggered().isPresent() + || ! systemTestStatus.isSuccess() + || ! systemTestStatus.lastTriggered().get().version().equals(target) + || systemTestStatus.isHanging(jobTimeoutLimit())) { + application = trigger(new Triggering(application, JobType.systemTest, false, "Upgrade to " + target), Collections.emptySet(), false); + applications().store(application); + } + } + } + + // Find next steps to trigger based on the state of the previous step + for (JobType jobType : (Iterable<JobType>) Stream.concat(Stream.of(JobType.component), jobs.stream())::iterator) { + JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); + if (jobStatus == null) continue; // job has never run - /** Decide whether to retry due to capacity restrictions */ - private boolean retryBecauseOutOfCapacity(Application application, JobType jobType) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (jobStatus == null || ! jobStatus.jobError().equals(Optional.of(JobError.outOfCapacity))) return false; - // Retry the job if it failed recently - return jobStatus.firstFailing().get().at().isAfter(clock.instant().minus(Duration.ofMinutes(15))); + // Collect the subset of next jobs which have not run with the last changes + // TODO jvenstad: Change to be step-centric. + List<JobType> nextJobs = order.nextAfter(jobType, application); + for (JobType nextJobType : nextJobs) { + JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType); + if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit())) { + boolean isRetry = nextStatus != null && nextStatus.jobError().filter(JobError.outOfCapacity::equals).isPresent(); + application = trigger(new Triggering(application, nextJobType, isRetry, isRetry ? "Retrying on out of capacity" : "Available change in " + jobType.jobName()), nextJobs, false); + } + } + applications().store(application); + } } + private ApplicationController applications() { return controller.applications(); } + /** Returns whether the given job type should be triggered according to deployment spec */ private boolean hasJob(JobType jobType, Application application) { if ( ! jobType.isProduction()) return true; // Deployment spec only determines this for production jobs. diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java index 895ba195b08..5c24b70fd65 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java @@ -142,9 +142,6 @@ public class ControllerTest { tester.clock().advance(Duration.ofHours(1)); - // Need to complete the job, or new jobs won't start. - tester.jobCompletion(productionCorpUsEast1).application(app1).unsuccessful().submit(); - // system and staging test job - succeeding tester.jobCompletion(component).application(app1).nextBuildNumber().uploadArtifact(applicationPackage).submit(); applicationVersion = tester.application("app1").change().application().get(); @@ -156,6 +153,7 @@ public class ControllerTest { tester.deployAndNotify(app1, applicationPackage, true, stagingTest); // production job succeeding now + tester.jobCompletion(productionCorpUsEast1).application(app1).unsuccessful().submit(); tester.deployAndNotify(app1, applicationPackage, true, productionCorpUsEast1); expectedJobStatus = expectedJobStatus .withTriggering(version1, applicationVersion, "", tester.clock().instant().minus(Duration.ofMillis(1))) @@ -280,6 +278,7 @@ public class ControllerTest { // Version upgrade changes system version applications.deploymentTrigger().triggerChange(app1.id(), Change.of(newSystemVersion)); + tester.deploymentTrigger().triggerReadyJobs(); tester.deployAndNotify(app1, applicationPackage, true, systemTest); tester.deployAndNotify(app1, applicationPackage, true, stagingTest); tester.deployAndNotify(app1, applicationPackage, true, productionUsWest1); @@ -450,17 +449,6 @@ public class ControllerTest { tester.deployAndNotify(app3, applicationPackage, true, stagingTest); tester.deployAndNotify(app3, applicationPackage, true, productionCorpUsEast1); - // app1: 15 minutes pass, staging-test job is still failing due out of capacity, but is no longer re-queued by - // out of capacity retry mechanism - tester.clock().advance(Duration.ofMinutes(15)); - tester.jobCompletion(stagingTest).application(app1).error(JobError.outOfCapacity).submit(); // Clear the previous staging test - tester.jobCompletion(component).application(app1).nextBuildNumber().uploadArtifact(applicationPackage).submit(); - tester.deployAndNotify(app1, applicationPackage, true, false, systemTest); - tester.deploy(stagingTest, app1, applicationPackage); - assertEquals(1, deploymentQueue.takeJobsToRun().size()); - tester.jobCompletion(stagingTest).application(app1).error(JobError.outOfCapacity).submit(); - assertTrue("No jobs queued", deploymentQueue.jobs().isEmpty()); - // app2 and app3: New change triggers system-test jobs // Provide a changed application package, too, or the deployment is a no-op. tester.jobCompletion(component).application(app2).nextBuildNumber().uploadArtifact(applicationPackage).submit(); @@ -469,14 +457,6 @@ public class ControllerTest { tester.jobCompletion(component).application(app3).nextBuildNumber().uploadArtifact(applicationPackage).submit(); tester.deployAndNotify(app3, applicationPackage2, true, systemTest); - assertEquals(2, deploymentQueue.jobs().size()); - - // app1: 4 hours pass in total, staging-test job for app1 is re-queued by periodic trigger mechanism and added at the - // front of the queue - tester.clock().advance(Duration.ofHours(3)); - tester.clock().advance(Duration.ofMinutes(50)); - tester.readyJobTrigger().maintain(); - assertEquals(Collections.singletonList(new BuildService.BuildJob(project1, stagingTest.jobName())), deploymentQueue.takeJobsToRun()); assertEquals(Collections.singletonList(new BuildService.BuildJob(project2, stagingTest.jobName())), deploymentQueue.takeJobsToRun()); assertEquals(Collections.singletonList(new BuildService.BuildJob(project3, stagingTest.jobName())), deploymentQueue.takeJobsToRun()); @@ -578,6 +558,7 @@ public class ControllerTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Test environments pass tester.deploy(DeploymentJobs.JobType.systemTest, application, applicationPackage); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index 92bf22df535..6bc544605b6 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -101,6 +101,7 @@ public class DeploymentTester { configServer().setDefaultVersion(version); updateVersionStatus(version); upgrader().maintain(); + readyJobTrigger().maintain(); } public Version defaultVespaVersion() { @@ -275,7 +276,7 @@ public class DeploymentTester { assertEquals(job.jobName(), buildJob.jobName()); } if (expectOnlyTheseJobs) - assertEquals(jobs.length, countJobsOf(application)); + assertEquals("Unexpected job queue: " + jobsOf(application), jobs.length, jobsOf(application).size()); deploymentQueue().removeJobs(application.id()); } @@ -286,15 +287,17 @@ public class DeploymentTester { throw new IllegalArgumentException(jobType + " is not scheduled for " + application); } - private int countJobsOf(Application application) { - return (int) deploymentQueue().jobs().stream() - .filter(job -> job.projectId() == application.deploymentJobs().projectId().get()) - .count(); + private List<JobType> jobsOf(Application application) { + return deploymentQueue().jobs().stream() + .filter(job -> job.projectId() == application.deploymentJobs().projectId().get()) + .map(buildJob -> JobType.fromJobName(buildJob.jobName())) + .collect(Collectors.toList()); } private void notifyJobCompletion(DeploymentJobs.JobReport report) { clock().advance(Duration.ofMillis(1)); applications().notifyJobCompletion(report); + applications().deploymentTrigger().triggerReadyJobs(); } public static ApplicationPackage applicationPackage(String upgradePolicy) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index ce765249b97..364cb66c3d1 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -55,6 +55,7 @@ public class DeploymentTriggerTest { Version version = new Version(5, 1); tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Deploy completely once tester.jobCompletion(component).application(app).uploadArtifact(applicationPackage).submit(); @@ -66,17 +67,11 @@ public class DeploymentTriggerTest { version = new Version(5, 2); tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // system-test fails and is retried tester.deployAndNotify(app, applicationPackage, false, JobType.systemTest); - assertEquals("Retried immediately", 1, tester.deploymentQueue().jobs().size()); - tester.clock().advance(Duration.ofHours(1)); - tester.deployAndNotify(app, applicationPackage, false, JobType.systemTest); - tester.clock().advance(Duration.ofHours(1)); - assertEquals("Nothing scheduled", 0, tester.deploymentQueue().jobs().size()); - tester.readyJobTrigger().maintain(); // Causes retry of systemTests - - assertEquals("Scheduled retry", 1, tester.deploymentQueue().jobs().size()); + assertEquals("Job is retried on failure", 1, tester.deploymentQueue().jobs().size()); tester.deployAndNotify(app, applicationPackage, true, JobType.systemTest); // staging-test times out and is retried @@ -390,28 +385,22 @@ public class DeploymentTriggerTest { tester.upgradeSystem(version1); tester.completeUpgradeWithError(application, version1, applicationPackage, productionEuWest1); - // Exhaust the retry, so productionEuWest1 is no longer running. - tester.clock().advance(Duration.ofHours(1)); - tester.deployAndNotify(application, Optional.empty(), false, true, productionEuWest1); - assertTrue(tester.deploymentQueue().jobs().isEmpty()); - // Deploy the new application version, even though the platform version is already deployed in us-central-1. // Let it fail in us-central-1 after deployment, so we can test this zone is later skipped. - tester.completeDeploymentWithError(application, applicationPackage, BuildJob.defaultBuildNumber + 1, productionUsCentral1); + tester.jobCompletion(component).application(application).nextBuildNumber().uploadArtifact(applicationPackage).submit(); + tester.deployAndNotify(application, applicationPackage, true, false, systemTest); + tester.deployAndNotify(application, applicationPackage, true, false, stagingTest); + tester.jobCompletion(productionEuWest1).application(application).unsuccessful().submit(); tester.deploy(productionUsCentral1, application, Optional.empty(), false); + // Deploying before notifying here makes the job not re-trigger, but instead triggers the next job (because of triggerReadyJobs() in notification.) + tester.deployAndNotify(application, applicationPackage, false, productionUsCentral1); assertEquals(ApplicationVersion.from(BuildJob.defaultSourceRevision, BuildJob.defaultBuildNumber + 1), app.get().deployments().get(ZoneId.from("prod.us-central-1")).applicationVersion()); - // Exhaust the automatic retry. - tester.clock().advance(Duration.ofHours(1)); - tester.deployAndNotify(application, Optional.empty(), false, true, productionUsCentral1); - assertTrue(tester.deploymentQueue().jobs().isEmpty()); - - // Let the ReadyJobTrigger get what it thinks is the next job -- should be the last job. - tester.readyJobTrigger().maintain(); assertEquals(Collections.singletonList(new BuildService.BuildJob(1, productionEuWest1.jobName())), tester.deploymentQueue().jobs()); + tester.deploy(productionEuWest1, application, Optional.empty(), false); tester.deployAndNotify(application, Optional.empty(), false, true, productionEuWest1); assertFalse(app.get().change().isPresent()); @@ -438,11 +427,6 @@ public class DeploymentTriggerTest { tester.deployAndNotify(application, Optional.empty(), false, true, productionUsCentral1); tester.deploy(productionUsCentral1, application, Optional.empty(), false); - // Exhaust the automatic retry. - tester.clock().advance(Duration.ofHours(1)); - tester.deployAndNotify(application, Optional.empty(), false, true, productionUsCentral1); - assertTrue(tester.deploymentQueue().jobs().isEmpty()); - ApplicationVersion appVersion1 = ApplicationVersion.from(BuildJob.defaultSourceRevision, BuildJob.defaultBuildNumber + 1); assertEquals(appVersion1, app.get().deployments().get(ZoneId.from("prod.us-central-1")).applicationVersion()); @@ -450,13 +434,14 @@ public class DeploymentTriggerTest { tester.deploymentTrigger().cancelChange(application.id(), true); assertEquals(Change.of(appVersion1), app.get().change()); - // Now cancel the change -- this should not normally happen. + // Now cancel the change as is done through the web API. tester.deploymentTrigger().cancelChange(application.id(), false); assertEquals(Change.empty(), app.get().change()); // A new version is released, which should now deploy the currently deployed application version to avoid downgrades. Version version1 = new Version("6.2"); tester.upgradeSystem(version1); + tester.jobCompletion(productionUsCentral1).application(application).unsuccessful().submit(); tester.completeUpgrade(application, version1, applicationPackage); assertEquals(appVersion1, app.get().deployments().get(ZoneId.from("prod.us-central-1")).applicationVersion()); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java index 548f7c33fa1..f088c4216ba 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java @@ -148,6 +148,7 @@ public class DeploymentIssueReporterTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); tester.completeUpgradeWithError(app2, version, canaryPackage, systemTest); tester.updateVersionStatus(version); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java index 092cdcd6984..429a0da6543 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java @@ -23,6 +23,7 @@ import java.util.Collections; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.component; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionUsEast3; +import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -54,6 +55,7 @@ public class FailureRedeployerTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Test environments pass tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.systemTest); @@ -68,14 +70,14 @@ public class FailureRedeployerTest { // Another version is released, which cancels any pending upgrades to lower versions version = Version.fromString("5.2"); tester.updateVersionStatus(version); - tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3); // Finish previous production job. tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Application starts upgrading to new version", 1, tester.deploymentQueue().jobs().size()); assertEquals("Application has pending upgrade to " + version, version, tester.application(app.id()).change().platform().get()); - // Failure redeployer does not retry failing job for prod.us-east-3 as there's an ongoing deployment + // Failure re-deployer does not retry failing job for prod.us-east-3, since it no longer has an available change tester.clock().advance(Duration.ofMinutes(1)); - tester.readyJobTrigger().maintain(); + tester.jobCompletion(DeploymentJobs.JobType.productionUsEast3).application(app).unsuccessful().submit(); assertFalse("Job is not retried", tester.deploymentQueue().jobs().stream() .anyMatch(j -> j.jobName().equals(DeploymentJobs.JobType.productionUsEast3.jobName()))); @@ -83,16 +85,8 @@ public class FailureRedeployerTest { tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.systemTest); tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.stagingTest); - // Production job fails again and exhausts all immediate retries + // Production job fails again, and is retried tester.deployAndNotify(app, applicationPackage, false, DeploymentJobs.JobType.productionUsEast3); - tester.deploymentQueue().takeJobsToRun(); - tester.clock().advance(Duration.ofMinutes(10)); - tester.jobCompletion(DeploymentJobs.JobType.productionUsEast3).application(app).unsuccessful().submit(); - assertTrue("Retries exhausted", tester.deploymentQueue().jobs().isEmpty()); - assertTrue("Failure is recorded", tester.application(app.id()).deploymentJobs().hasFailures()); - - // Failure redeployer retries job - tester.clock().advance(Duration.ofMinutes(5)); tester.readyJobTrigger().maintain(); assertEquals("Job is retried", Collections.singletonList(new BuildService.BuildJob(app.deploymentJobs().projectId().get(), productionUsEast3.jobName())), tester.deploymentQueue().jobs()); @@ -154,29 +148,25 @@ public class FailureRedeployerTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Application has pending upgrade to " + version, version, tester.application(app.id()).change().platform().get()); - // system-test fails and exhausts all immediate retries + // system-test fails and is left with a retry tester.deployAndNotify(app, applicationPackage, false, DeploymentJobs.JobType.systemTest); - tester.deploymentQueue().takeJobsToRun(); - tester.clock().advance(Duration.ofMinutes(10)); - tester.jobCompletion(DeploymentJobs.JobType.systemTest).application(app).unsuccessful().submit(); - assertTrue("Retries exhausted", tester.deploymentQueue().jobs().isEmpty()); // Another version is released version = Version.fromString("5.2"); tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); + + // Job is left "running", so needs to time out before it can be retried. + tester.clock().advance(Duration.ofHours(13)); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Application has pending upgrade to " + version, version, tester.application(app.id()).change().platform().get()); - // Consume system-test job for 5.2 - tester.deploymentQueue().takeJobsToRun(); - - // Failure re-deployer does not retry failing system-test job as it failed for an older change - tester.clock().advance(Duration.ofMinutes(5)); - tester.readyJobTrigger().maintain(); - assertTrue("No jobs retried", tester.deploymentQueue().jobs().isEmpty()); + // Cancellation of outdated version and triggering on a new version is done by the upgrader. + assertEquals(version, tester.application(app.id()).deploymentJobs().jobStatus().get(systemTest).lastTriggered().get().version()); } @Test @@ -207,6 +197,7 @@ public class FailureRedeployerTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Test environments pass tester.deploy(DeploymentJobs.JobType.systemTest, application, applicationPackage); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java index 12fb2b6c862..a4e464a065f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java @@ -42,6 +42,7 @@ public class OutstandingChangeDeployerTest { Version version = new Version(6, 2); tester.deploymentTrigger().triggerChange(tester.application("app1").id(), Change.of(version)); + tester.deploymentTrigger().triggerReadyJobs(); assertEquals(Change.of(version), tester.application("app1").change()); assertFalse(tester.application("app1").outstandingChange().isPresent()); @@ -59,6 +60,7 @@ public class OutstandingChangeDeployerTest { assertEquals(1, tester.deploymentQueue().jobs().size()); deployer.maintain(); + tester.deploymentTrigger().triggerReadyJobs(); assertEquals("No effect as job is in progress", 1, tester.deploymentQueue().jobs().size()); assertEquals("1.0.43-cafed00d", app.outstandingChange().application().get().id()); @@ -68,6 +70,7 @@ public class OutstandingChangeDeployerTest { assertEquals("Upgrade done", 0, tester.deploymentQueue().jobs().size()); deployer.maintain(); + tester.deploymentTrigger().triggerReadyJobs(); app = tester.application("app1"); assertEquals("1.0.43-cafed00d", app.change().application().get().id()); List<BuildService.BuildJob> jobs = tester.deploymentQueue().jobs(); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 021fe0954c2..b7eff6d8448 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -21,6 +21,8 @@ import java.time.Duration; import java.time.Instant; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.component; +import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionEuWest1; +import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionUsEast3; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionUsWest1; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.stagingTest; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest; @@ -42,6 +44,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("No applications: Nothing to do", 0, tester.deploymentQueue().jobs().size()); // Setup applications @@ -53,6 +56,7 @@ public class UpgraderTest { Application conservative0 = tester.createAndDeploy("conservative0", 6, "conservative"); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("All already on the right version: Nothing to do", 0, tester.deploymentQueue().jobs().size()); // --- 5.1 is released - everything goes smoothly @@ -60,6 +64,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("New system version: Should upgrade Canaries", 2, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary0, version, "canary"); @@ -67,6 +72,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("One canary pending; nothing else", 1, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary1, version, "canary"); @@ -74,6 +80,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Canaries done: Should upgrade defaults", 3, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(default0, version, "default"); @@ -83,11 +90,13 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.high, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Normals done: Should upgrade conservatives", 1, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(conservative0, version, "conservative"); tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Nothing to do", 0, tester.deploymentQueue().jobs().size()); // --- 5.2 is released - which fails a Canary @@ -95,31 +104,25 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("New system version: Should upgrade Canaries", 2, tester.deploymentQueue().jobs().size()); tester.completeUpgradeWithError(canary0, version, "canary", DeploymentJobs.JobType.stagingTest); assertEquals("Other Canary was cancelled", 2, tester.deploymentQueue().jobs().size()); - // TODO: Cancelled would mean it was triggerd, removed from the build system, but never reported in. - // Thus, the expected number of jobs should be 1, above: the retrying canary0. - // Further, canary1 should be retried after the timeout period of 12 hours, but verifying this is - // not possible when jobs are consumed form the build system on notification, rather than on deploy. tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Version broken, but Canaries should keep trying", 2, tester.deploymentQueue().jobs().size()); - // Exhaust canary retries. - tester.jobCompletion(systemTest).application(canary1).unsuccessful().submit(); - tester.clock().advance(Duration.ofHours(1)); - tester.deployAndNotify(canary0, DeploymentTester.applicationPackage("canary"), false, DeploymentJobs.JobType.stagingTest); - tester.jobCompletion(systemTest).application(canary1).unsuccessful().submit(); - // --- A new version is released - which repairs the Canary app and fails a default + tester.clock().advance(Duration.ofHours(13)); version = Version.fromString("5.3"); tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("New system version: Should upgrade Canaries", 2, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary0, version, "canary"); @@ -127,6 +130,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("One canary pending; nothing else", 1, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary1, version, "canary"); @@ -134,6 +138,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Canaries done: Should upgrade defaults", 3, tester.deploymentQueue().jobs().size()); @@ -147,22 +152,27 @@ public class UpgraderTest { assertEquals("Upgrade with error should retry", 1, tester.deploymentQueue().jobs().size()); - // Finish previous run, with exhausted retry. - tester.clock().advance(Duration.ofHours(1)); - tester.jobCompletion(stagingTest).application(default0).unsuccessful().submit(); // --- Failing application is repaired by changing the application, causing confidence to move above 'high' threshold // Deploy application change - tester.deployCompletely(tester.application("default0"), DeploymentTester.applicationPackage("default"), BuildJob.defaultBuildNumber + 1); + tester.deploymentQueue().takeJobsToRun(); + tester.jobCompletion(component).application(default0).nextBuildNumber().uploadArtifact(DeploymentTester.applicationPackage("default")).submit(); + tester.jobCompletion(stagingTest).application(default0).unsuccessful().submit(); + tester.deployAndNotify(default0, "default", true, systemTest); + tester.deployAndNotify(default0, "default", true, stagingTest); + tester.deployAndNotify(default0, "default", true, productionUsWest1); + tester.deployAndNotify(default0, "default", true, productionUsEast3); tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.high, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Normals done: Should upgrade conservatives", 1, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(conservative0, version, "conservative"); tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Applications are on 5.3 - nothing to do", 0, tester.deploymentQueue().jobs().size()); // --- Starting upgrading to a new version which breaks, causing upgrades to commence on the previous version @@ -171,11 +181,13 @@ public class UpgraderTest { Application default4 = tester.createAndDeploy("default4", 5, "default"); tester.updateVersionStatus(version54); tester.upgrader().maintain(); // cause canary upgrades to 5.4 + tester.readyJobTrigger().maintain(); tester.completeUpgrade(canary0, version54, "canary"); tester.completeUpgrade(canary1, version54, "canary"); tester.updateVersionStatus(version54); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade of defaults are scheduled", 5, tester.deploymentQueue().jobs().size()); assertEquals(version54, tester.application(default0.id()).change().platform().get()); @@ -189,11 +201,13 @@ public class UpgraderTest { Version version55 = Version.fromString("5.5"); tester.updateVersionStatus(version55); tester.upgrader().maintain(); // cause canary upgrades to 5.5 + tester.readyJobTrigger().maintain(); tester.completeUpgrade(canary0, version55, "canary"); tester.completeUpgrade(canary1, version55, "canary"); tester.updateVersionStatus(version55); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade of defaults are scheduled", 5, tester.deploymentQueue().jobs().size()); assertEquals(version55, tester.application(default0.id()).change().platform().get()); @@ -206,13 +220,12 @@ public class UpgraderTest { tester.completeUpgrade(default2, version54, "default"); tester.completeUpgradeWithError(default3, version54, "default", DeploymentJobs.JobType.stagingTest); - // Exhaust immediate retries for upgrade - tester.clock().advance(Duration.ofHours(1)); - tester.jobCompletion(stagingTest).application(default3).unsuccessful().submit(); tester.completeUpgradeWithError(default4, version54, "default", DeploymentJobs.JobType.productionUsWest1); // State: Default applications started upgrading to 5.5 + tester.clock().advance(Duration.ofHours(13)); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); tester.completeUpgradeWithError(default0, version55, "default", DeploymentJobs.JobType.stagingTest); tester.completeUpgradeWithError(default1, version55, "default", DeploymentJobs.JobType.stagingTest); tester.completeUpgradeWithError(default2, version55, "default", DeploymentJobs.JobType.stagingTest); @@ -225,6 +238,7 @@ public class UpgraderTest { tester.jobCompletion(DeploymentJobs.JobType.productionUsWest1).application(default3).unsuccessful().submit(); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade of defaults are scheduled on 5.4 instead, since 5.5 broken: " + "This is default3 since it failed upgrade on both 5.4 and 5.5", 1, tester.deploymentQueue().jobs().size()); @@ -236,12 +250,14 @@ public class UpgraderTest { // --- Setup DeploymentTester tester = new DeploymentTester(); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("No system version: Nothing to do", 0, tester.deploymentQueue().jobs().size()); Version version = Version.fromString("5.0"); // (lower than the hardcoded version in the config server client) tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("No applications: Nothing to do", 0, tester.deploymentQueue().jobs().size()); // Setup applications @@ -259,6 +275,7 @@ public class UpgraderTest { Application default9 = tester.createAndDeploy("default9", 12, "default"); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("All already on the right version: Nothing to do", 0, tester.deploymentQueue().jobs().size()); // --- A new version is released @@ -266,6 +283,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("New system version: Should upgrade Canaries", 2, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary0, version, "canary"); @@ -273,6 +291,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("One canary pending; nothing else", 1, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary1, version, "canary"); @@ -280,6 +299,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Canaries done: Should upgrade defaults", 10, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(default0, version, "default"); @@ -291,6 +311,7 @@ public class UpgraderTest { // > 40% and at least 4 failed - version is broken tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); assertEquals("Upgrades are cancelled", 0, tester.deploymentQueue().jobs().size()); } @@ -313,6 +334,7 @@ public class UpgraderTest { tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Application is on expected version: Nothing to do", 0, tester.deploymentQueue().jobs().size()); @@ -321,16 +343,13 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // system-test completes successfully tester.deployAndNotify(app, applicationPackage, true, systemTest); - // staging-test fails multiple times, exhausts retries and failure is recorded + // staging-test fails and failure is recorded tester.deployAndNotify(app, applicationPackage, false, DeploymentJobs.JobType.stagingTest); - tester.deploymentQueue().takeJobsToRun(); - tester.clock().advance(Duration.ofMinutes(10)); - tester.jobCompletion(stagingTest).application(app).unsuccessful().submit(); - assertTrue("Retries exhausted", tester.deploymentQueue().jobs().isEmpty()); assertTrue("Failure is recorded", tester.application(app.id()).deploymentJobs().hasFailures()); assertTrue("Application has pending change", tester.application(app.id()).change().isPresent()); @@ -341,12 +360,14 @@ public class UpgraderTest { // Upgrade is scheduled. system-tests starts, but does not complete tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertTrue("Application still has failures", tester.application(app.id()).deploymentJobs().hasFailures()); assertEquals(1, tester.deploymentQueue().jobs().size()); tester.deploymentQueue().takeJobsToRun(); // Upgrader runs again, nothing happens as there's already a job in progress for this change tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertTrue("No more jobs triggered at this time", tester.deploymentQueue().jobs().isEmpty()); } @@ -370,6 +391,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Canaries upgrade and raise confidence tester.completeUpgrade(canary0, version, "canary"); @@ -379,6 +401,7 @@ public class UpgraderTest { // Applications with default policy start upgrading tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade scheduled for remaining apps", 5, tester.deploymentQueue().jobs().size()); // 4/5 applications fail and lowers confidence @@ -389,6 +412,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // 5th app passes system-test, but does not trigger next job as upgrade is cancelled assertFalse("No change present", tester.applications().require(default4.id()).change().isPresent()); @@ -424,6 +448,7 @@ public class UpgraderTest { tester.updateVersionStatus(v1); assertEquals(v1, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Canaries upgrade and raise confidence of V+1 (other apps are not upgraded) tester.completeUpgrade(canary0, v1, "canary"); @@ -436,6 +461,7 @@ public class UpgraderTest { tester.updateVersionStatus(v2); assertEquals(v2, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // We "manually" cancel upgrades to V1 so that we can use the applications to make V2 fail instead // But we keep one (default4) to avoid V1 being garbage collected @@ -453,6 +479,7 @@ public class UpgraderTest { // Applications with default policy start upgrading to V2 tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade scheduled for remaining apps", 5, tester.deploymentQueue().jobs().size()); // 4/5 applications fail (in the last prod zone) and lowers confidence @@ -466,6 +493,7 @@ public class UpgraderTest { assertEquals(v2, tester.application("default0").deployments().get(ZoneId.from("prod.us-west-1")).version()); assertEquals(v0, tester.application("default0").deployments().get(ZoneId.from("prod.us-east-3")).version()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade to 5.1 scheduled for apps not completely on 5.1 or 5.2", 5, tester.deploymentQueue().jobs().size()); tester.deploymentTrigger().triggerReadyJobs(); @@ -514,6 +542,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Canaries upgrade and raise confidence tester.completeUpgrade(canary0, version, "canary"); @@ -523,6 +552,7 @@ public class UpgraderTest { // All applications upgrade successfully tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); tester.completeUpgrade(default0, version, "default"); tester.completeUpgrade(default1, version, "default"); tester.completeUpgrade(default2, version, "default"); @@ -565,16 +595,19 @@ public class UpgraderTest { // Application is not upgraded at this time tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertTrue("No jobs scheduled", tester.deploymentQueue().jobs().isEmpty()); // One hour passes, time is 19:00, still no upgrade tester.clock().advance(Duration.ofHours(1)); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertTrue("No jobs scheduled", tester.deploymentQueue().jobs().isEmpty()); // Two hours pass in total, time is 20:00 and application upgrades tester.clock().advance(Duration.ofHours(1)); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertFalse("Job is scheduled", tester.deploymentQueue().jobs().isEmpty()); tester.completeUpgrade(app, version, applicationPackage); assertTrue("All jobs consumed", tester.deploymentQueue().jobs().isEmpty()); @@ -608,6 +641,7 @@ public class UpgraderTest { // Application upgrade starts tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); tester.deployAndNotify(app, applicationPackage, true, systemTest); tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.stagingTest); clock.advance(Duration.ofHours(1)); // Entering block window after prod job is triggered @@ -638,9 +672,6 @@ public class UpgraderTest { public void testBlockVersionChangeHalfwayThoughThenNewVersion() { ManualClock clock = new ManualClock(Instant.parse("2017-09-29T16:00:00.00Z")); // Friday, 16:00 DeploymentTester tester = new DeploymentTester(new ControllerTester(clock)); - ReadyJobsTrigger readyJobsTrigger = new ReadyJobsTrigger(tester.controller(), - Duration.ofHours(1), - new JobControl(tester.controllerTester().curator())); Version version = Version.fromString("5.0"); tester.updateVersionStatus(version); @@ -663,6 +694,7 @@ public class UpgraderTest { // Application upgrade starts tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); tester.deployAndNotify(app, applicationPackage, true, systemTest); tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.stagingTest); tester.deployAndNotify(app, applicationPackage, true, productionUsWest1); @@ -675,14 +707,14 @@ public class UpgraderTest { version = Version.fromString("5.2"); tester.updateVersionStatus(version); tester.upgrader().maintain(); - readyJobsTrigger.maintain(); + tester.readyJobTrigger().maintain(); assertTrue("Nothing is scheduled", tester.deploymentQueue().jobs().isEmpty()); // Monday morning: We are not blocked tester.clock().advance(Duration.ofDays(1)); // Sunday, 17:00 tester.clock().advance(Duration.ofHours(17)); // Monday, 10:00 tester.upgrader().maintain(); - readyJobsTrigger.maintain(); + tester.readyJobTrigger().maintain(); // We proceed with the new version in the expected order, not starting with the previously blocked version: // Test jobs are run with the new version, but not production as we are in the block window tester.deployAndNotify(app, applicationPackage, true, systemTest); @@ -730,6 +762,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); // Canaries upgrade and raise confidence tester.completeUpgrade(canary0, version, canaryApplicationPackage); @@ -740,6 +773,7 @@ public class UpgraderTest { // Applications with default policy start upgrading tester.clock().advance(Duration.ofMinutes(1)); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade scheduled for remaining apps", 5, tester.deploymentQueue().jobs().size()); // 4/5 applications fail, confidence is lowered and upgrade is cancelled @@ -751,9 +785,9 @@ public class UpgraderTest { assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); - // Exhaust retries and finish runs - tester.clock().advance(Duration.ofHours(1)); + // Finish runs tester.jobCompletion(systemTest).application(default0).unsuccessful().submit(); tester.jobCompletion(systemTest).application(default1).unsuccessful().submit(); tester.jobCompletion(systemTest).application(default2).unsuccessful().submit(); @@ -780,6 +814,7 @@ public class UpgraderTest { assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); assertEquals("Upgrade scheduled for previously failing apps", 4, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(default0, version, defaultApplicationPackageV2); @@ -822,6 +857,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); upgrader.maintain(); + tester.readyJobTrigger().maintain(); assertEquals(2, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(canary0, version, "canary"); @@ -830,16 +866,19 @@ public class UpgraderTest { // Next run upgrades a subset upgrader.maintain(); + tester.readyJobTrigger().maintain(); assertEquals(2, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(default0, version, "default"); tester.completeUpgrade(default2, version, "default"); // Remaining applications upgraded upgrader.maintain(); + tester.readyJobTrigger().maintain(); assertEquals(2, tester.deploymentQueue().jobs().size()); tester.completeUpgrade(default1, version, "default"); tester.completeUpgrade(default3, version, "default"); upgrader.maintain(); + tester.readyJobTrigger().maintain(); assertTrue("All jobs consumed", tester.deploymentQueue().jobs().isEmpty()); } @@ -860,13 +899,10 @@ public class UpgraderTest { version = Version.fromString("5.1"); tester.updateVersionStatus(version); tester.upgrader().maintain(); + tester.readyJobTrigger().maintain(); tester.deployAndNotify(app, applicationPackage, true, systemTest); tester.deployAndNotify(app, applicationPackage, true, stagingTest); - - // Production job fails and exhausts retries, new application changes are now accepted - tester.deployAndNotify(app, applicationPackage, false, productionUsWest1); - tester.clock().advance(Duration.ofHours(1)); tester.deployAndNotify(app, applicationPackage, false, productionUsWest1); // New application change @@ -880,8 +916,9 @@ public class UpgraderTest { app.change().application().get().id().equals(applicationVersion)); // Deployment completes - tester.deployAndNotify(app, applicationPackage, true, systemTest); - tester.deployAndNotify(app, applicationPackage, true, stagingTest); + tester.deployAndNotify(app, applicationPackage, true, false, systemTest); + tester.deployAndNotify(app, applicationPackage, true, false, stagingTest); + tester.jobCompletion(productionUsWest1).application(app).unsuccessful().submit(); tester.deployAndNotify(app, applicationPackage, true, productionUsWest1); assertTrue("All jobs consumed", tester.deploymentQueue().jobs().isEmpty()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ContainerControllerTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ContainerControllerTester.java index b810c3adeb5..8e60e63e873 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ContainerControllerTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ContainerControllerTester.java @@ -126,6 +126,7 @@ public class ContainerControllerTester { throw new RuntimeException(e); } controller().applications().notifyJobCompletion(jobReport); + controller().applications().deploymentTrigger().triggerReadyJobs(); } private AthenzDomain addTenantAthenzDomain(String domainName, String userName) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java index 0ae9cf767d0..b9eef2069d9 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.restapi.application; +import com.google.common.base.Functions; import com.yahoo.application.container.handler.Request; import com.yahoo.application.container.handler.Response; import com.yahoo.component.Version; @@ -173,22 +174,6 @@ public class ApplicationApiTest extends ControllerContainerTest { addUserToHostedOperatorRole(HostedAthenzIdentities.from(HOSTED_VESPA_OPERATOR)); - // POST triggering of a full deployment to an application (if version is omitted, current system version is used) - tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/deploying", POST) - .userIdentity(HOSTED_VESPA_OPERATOR) - .data("6.1.0"), - new File("application-deployment.json")); - - // DELETE (cancel) ongoing change - tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/deploying", DELETE) - .userIdentity(HOSTED_VESPA_OPERATOR), - new File("application-deployment-cancelled.json")); - - // DELETE (cancel) again is a no-op - tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/deploying", DELETE) - .userIdentity(HOSTED_VESPA_OPERATOR), - new File("application-deployment-cancelled-no-op.json")); - // POST (deploy) an application to a zone - manual user deployment HttpEntity entity = createApplicationDeployData(applicationPackage, Optional.empty()); tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/environment/dev/region/us-west-1/instance/default/deploy", POST) @@ -285,6 +270,21 @@ public class ApplicationApiTest extends ControllerContainerTest { .recursive("true"), new File("application1-recursive.json")); + // DELETE (cancel) ongoing change + tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/deploying", DELETE) + .userIdentity(HOSTED_VESPA_OPERATOR), + new File("application-deployment-cancelled.json")); + + // DELETE (cancel) again is a no-op + tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/deploying", DELETE) + .userIdentity(HOSTED_VESPA_OPERATOR), + new File("application-deployment-cancelled-no-op.json")); + + // POST triggering of a full deployment to an application (if version is omitted, current system version is used) + tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/deploying", POST) + .userIdentity(HOSTED_VESPA_OPERATOR) + .data("6.1.0"), + new File("application-deployment.json")); // POST a 'restart application' command tester.assertResponse(request("/application/v4/tenant/tenant1/application/application1/environment/prod/region/corp-us-east-1/instance/default/restart", POST) @@ -792,7 +792,7 @@ public class ApplicationApiTest extends ControllerContainerTest { Version vespaVersion = new Version("6.1"); // system version from mock config server client - BuildJob job = new BuildJob(this::notifyCompletion, tester.artifactRepository()) + BuildJob job = new BuildJob(report -> notifyCompletion(report, tester), tester.artifactRepository()) .application(app) .projectId(projectId); job.type(DeploymentJobs.JobType.component).uploadArtifact(applicationPackage).submit(); @@ -846,7 +846,7 @@ public class ApplicationApiTest extends ControllerContainerTest { .build(); // Report job failing with out of capacity - BuildJob job = new BuildJob(this::notifyCompletion, tester.artifactRepository()) + BuildJob job = new BuildJob(report -> notifyCompletion(report, tester), tester.artifactRepository()) .application(app) .projectId(projectId); job.type(DeploymentJobs.JobType.component).uploadArtifact(applicationPackage).submit(); @@ -866,12 +866,13 @@ public class ApplicationApiTest extends ControllerContainerTest { assertEquals(DeploymentJobs.JobError.outOfCapacity, jobStatus.jobError().get()); } - private void notifyCompletion(DeploymentJobs.JobReport report) { + private void notifyCompletion(DeploymentJobs.JobReport report, ContainerControllerTester tester) { assertResponse(request("/application/v4/tenant/tenant1/application/application1/jobreport", POST) .userIdentity(HOSTED_VESPA_OPERATOR) .data(asJson(report)) .get(), 200, "{\"message\":\"ok\"}"); + tester.controller().applications().deploymentTrigger().triggerReadyJobs(); } private static byte[] asJson(DeploymentJobs.JobReport report) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-deployment-cancelled.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-deployment-cancelled.json index 3b6d8ed71e9..bc09003d86f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-deployment-cancelled.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-deployment-cancelled.json @@ -1 +1 @@ -{"message":"Cancelled upgrade to 6.1 for application 'tenant1.application1'"} +{"message":"Cancelled application change to 1.0.42-commit1 for application 'tenant1.application1'"} diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiTest.java index 4de3b9abd5b..743baf76759 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiTest.java @@ -69,6 +69,7 @@ public class DeploymentApiTest extends ControllerContainerTest { // Applications upgrade, 1/2 succeed tester.upgrader().maintain(); + tester.controller().applications().deploymentTrigger().triggerReadyJobs(); deployCompletely(failingApplication, applicationPackage, projectId, false); deployCompletely(productionApplication, applicationPackage, projectId, true); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java index 27e26e3267a..14f5d00ec88 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java @@ -156,12 +156,6 @@ public class VersionStatusTest { assertEquals("One canary failed: Broken", Confidence.broken, confidence(tester.controller(), version1)); - // Finish running jobs - tester.deployAndNotify(canary2, DeploymentTester.applicationPackage("canary"), false, systemTest); - tester.clock().advance(Duration.ofHours(1)); - tester.deployAndNotify(canary1, DeploymentTester.applicationPackage("canary"), false, productionUsWest1); - tester.deployAndNotify(canary2, DeploymentTester.applicationPackage("canary"), false, systemTest); - // New version is released Version version2 = new Version("5.2"); tester.upgradeSystem(version2); @@ -170,6 +164,7 @@ public class VersionStatusTest { // All canaries upgrade successfully tester.completeUpgrade(canary0, version2, "canary"); + tester.jobCompletion(productionUsWest1).application(canary1).unsuccessful().submit(); tester.completeUpgrade(canary1, version2, "canary"); assertEquals("Confidence for remains unchanged for version1: Broken", @@ -178,6 +173,7 @@ public class VersionStatusTest { Confidence.low, confidence(tester.controller(), version2)); // Remaining canary upgrades to version2 which raises confidence to normal and more apps upgrade + tester.jobCompletion(systemTest).application(canary2).unsuccessful().submit(); tester.completeUpgrade(canary2, version2, "canary"); tester.upgradeSystem(version2); assertEquals("Canaries have upgraded: Normal", |