From b1511cf1c5f4237a67d72151b10b5a0b9649c662 Mon Sep 17 00:00:00 2001 From: Jon Marius Venstad Date: Mon, 26 Mar 2018 15:37:11 +0200 Subject: Immediate retry triggers through trigger-ready-jobs --- .../controller/deployment/DeploymentTrigger.java | 33 +++++++++------------- .../vespa/hosted/controller/ControllerTest.java | 2 +- .../restapi/application/responses/application.json | 18 ++++++++++++ .../responses/application1-recursive.json | 18 ++++++++++++ 4 files changed, 51 insertions(+), 20 deletions(-) (limited to 'controller-server/src') diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 54deb355836..cf4071b9035 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.logging.Logger; +import java.util.stream.Stream; /** * Responsible for scheduling deployment jobs in a build system and keeping @@ -114,8 +115,10 @@ public class DeploymentTrigger { } else if (retryBecauseOutOfCapacity(application, report.jobType())) application = trigger(new Triggering(application, report.jobType(), true, "Retrying on out of capacity"), Collections.emptySet(), false); - else if (retryBecauseNewFailure(application, report.jobType())) - application = trigger(new Triggering(application, report.jobType(), false, "Immediate retry on failure"), Collections.emptySet(), false); + else if (retryBecauseNewFailure(application, report.jobType())) { + triggerReadyJobs(application); + return; // Don't overwrite below. + } applications().store(application); }); @@ -134,6 +137,7 @@ public class DeploymentTrigger { /** Find the next step to trigger if any, and triggers it */ public void triggerReadyJobs(LockedApplication application) { if ( ! application.change().isPresent()) return; + List jobs = order.jobsFrom(application.deploymentSpec()); // Should the first step be triggered? @@ -147,23 +151,15 @@ public class DeploymentTrigger { || ! systemTestStatus.lastTriggered().get().version().equals(target) || systemTestStatus.isHanging(jobTimeoutLimit())) { application = trigger(new Triggering(application, JobType.systemTest, false, "Upgrade to " + target), Collections.emptySet(), false); - controller.applications().store(application); - } - } - else { - JobStatus componentStatus = application.deploymentJobs().jobStatus().get(JobType.component); - if (componentStatus != null && changesAvailable(application, componentStatus, systemTestStatus)) { - application = trigger(new Triggering(application, JobType.systemTest, false, "Available change in component"), Collections.emptySet(), false); - controller.applications().store(application); + applications().store(application); } } } // Find next steps to trigger based on the state of the previous step - for (JobType jobType : jobs) { + for (JobType jobType : (Iterable) Stream.concat(Stream.of(JobType.component), jobs.stream())::iterator) { JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); if (jobStatus == null) continue; // job has never run - if (jobStatus.isRunning(jobTimeoutLimit())) continue; // Collect the subset of next jobs which have not run with the last changes // TODO jvenstad: Change to be step-centric. @@ -173,7 +169,7 @@ public class DeploymentTrigger { if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit())) application = trigger(new Triggering(application, nextJobType, false, "Available change in " + jobType.jobName()), nextJobs, false); } - controller.applications().store(application); + applications().store(application); } } @@ -185,7 +181,6 @@ public class DeploymentTrigger { * @param force true to disable checks which should normally prevent this triggering from happening * @return the application in the triggered state, if actually triggered. This *must* be stored by the caller */ - // TODO jvenstad: Replace with (Collection concurrentlyWith) to allow, e.g., concurrent retries. public LockedApplication trigger(Triggering triggering, Collection concurrentlyWith, boolean force) { if (triggering.jobType == null) return triggering.application; // we are passed null when the last job has been reached @@ -209,11 +204,11 @@ public class DeploymentTrigger { deploymentQueue.addJob(triggering.application.id(), triggering.jobType, triggering.retry); // TODO jvenstad: Let triggering set only time of triggering (and reason, for debugging?) when build system is polled for job status. return triggering.application.withJobTriggering(triggering.jobType, - clock.instant(), - triggering.application.deployVersionFor(triggering.jobType, controller), - triggering.application.deployApplicationVersionFor(triggering.jobType, controller, false) - .orElse(ApplicationVersion.unknown), - triggering.reason); + clock.instant(), + triggering.application.deployVersionFor(triggering.jobType, controller), + triggering.application.deployApplicationVersionFor(triggering.jobType, controller, false) + .orElse(ApplicationVersion.unknown), + triggering.reason); } /** diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java index bba82ce2980..594a9c0da44 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java @@ -392,7 +392,7 @@ public class ControllerTest { // Initial failure tester.clock().advance(Duration.ofMillis(1000)); initialFailure = tester.clock().instant(); - tester.jobCompletion(component).application(app).submit(); + tester.jobCompletion(component).application(app).nextBuildNumber().uploadArtifact(applicationPackage).submit(); tester.deployAndNotify(app, applicationPackage, false, systemTest); assertEquals("Failure age is right at initial failure", initialFailure.plus(Duration.ofMillis(2)), firstFailing(app, tester).get().at()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json index 3d8d8a4b34d..f75f24aa74b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json @@ -181,6 +181,24 @@ "reason": "staging-test completed", "at": "(ignore)" } + }, + { + "type": "production-us-east-3", + "success": false, + "lastTriggered": { + "id": -1, + "version": "(ignore)", + "revision": { + "hash": "(ignore)", + "source": { + "gitRepository": "repository1", + "gitBranch": "master", + "gitCommit": "commit1" + } + }, + "reason": "Available change in production-corp-us-east-1", + "at": "(ignore)" + } } ], "changeBlockers": [ diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json index 16bd5c2e720..3109cce5731 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json @@ -181,6 +181,24 @@ "reason": "staging-test completed", "at": "(ignore)" } + }, + { + "type": "production-us-east-3", + "success": false, + "lastTriggered": { + "id": -1, + "version": "(ignore)", + "revision": { + "hash": "(ignore)", + "source": { + "gitRepository": "repository1", + "gitBranch": "master", + "gitCommit": "commit1" + } + }, + "reason": "Available change in production-corp-us-east-1", + "at": "(ignore)" + } } ], "changeBlockers": [ -- cgit v1.2.3