diff options
Diffstat (limited to 'controller-server/src/main/java/com/yahoo/vespa')
2 files changed, 49 insertions, 79 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java index 5823dd160c0..1d3fff57a78 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java @@ -553,6 +553,11 @@ public class ApplicationController { } public void notifyJobCompletion(JobReport report) { + log.log(Level.INFO, String.format("Notified of %s of %s %d for '%s'.", + report.jobError().map(error -> error + " failure").orElse("success"), + report.jobType(), + report.buildNumber(), + report.applicationId())); if ( ! get(report.applicationId()).isPresent()) { log.log(Level.WARNING, "Ignoring completion of job of project '" + report.projectId() + "': Unknown application '" + report.applicationId() + "'"); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 767ffbaa7ea..f6f65df56b7 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -87,17 +87,12 @@ public class DeploymentTrigger { // Handle successful starting and ending if (report.jobType() == JobType.component) { if (report.success()) { - if ( ! acceptNewApplicationVersionNow(application)) { - applications().store(application.withOutstandingChange(Change.of(applicationVersion))); - return; - } - // Note that in case of an ongoing upgrade this may result in both the upgrade and application - // change being deployed together - application = application.withChange(application.change().with(applicationVersion)); - } - else { // don't re-trigger component on failure - applications().store(application); - return; + if ( ! acceptNewApplicationVersionNow(application)) + application = application.withOutstandingChange(Change.of(applicationVersion)); + else + // Note that in case of an ongoing upgrade this may result in both the upgrade and application + // change being deployed together + application = application.withChange(application.change().with(applicationVersion)); } } else if (report.jobType().isProduction() && deploymentComplete(application)) { @@ -106,21 +101,6 @@ public class DeploymentTrigger { application = application.withChange(Change.empty()); } - // TODO jvenstad: Don't trigger. - // Trigger next - if (report.success()) { - triggerReadyJobs(application); - return; // Don't overwrite below. - } - else if (retryBecauseOutOfCapacity(application, report.jobType())) { - triggerReadyJobs(application); - return; // Don't overwrite below. - } - else if (retryBecauseNewFailure(application, report.jobType())) { - triggerReadyJobs(application); - return; // Don't overwrite below. - } - applications().store(application); }); } @@ -135,45 +115,6 @@ public class DeploymentTrigger { applications().lockIfPresent(application.id(), this::triggerReadyJobs); } - /** Find the next step to trigger if any, and triggers it */ - public void triggerReadyJobs(LockedApplication application) { - List<JobType> jobs = order.jobsFrom(application.deploymentSpec()); - - // Should the first step be triggered? - if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) { - JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); - if (application.change().platform().isPresent()) { - Version target = application.change().platform().get(); - if (systemTestStatus == null - || ! systemTestStatus.lastTriggered().isPresent() - || ! systemTestStatus.isSuccess() - || ! systemTestStatus.lastTriggered().get().version().equals(target) - || systemTestStatus.isHanging(jobTimeoutLimit())) { - application = trigger(new Triggering(application, JobType.systemTest, false, "Upgrade to " + target), Collections.emptySet(), false); - applications().store(application); - } - } - } - - // Find next steps to trigger based on the state of the previous step - for (JobType jobType : (Iterable<JobType>) Stream.concat(Stream.of(JobType.component), jobs.stream())::iterator) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (jobStatus == null) continue; // job has never run - - // Collect the subset of next jobs which have not run with the last changes - // TODO jvenstad: Change to be step-centric. - List<JobType> nextJobs = order.nextAfter(jobType, application); - for (JobType nextJobType : nextJobs) { - JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType); - if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit())) { - boolean isRetry = nextStatus != null && nextStatus.jobError().filter(JobError.outOfCapacity::equals).isPresent(); - application = trigger(new Triggering(application, nextJobType, isRetry, isRetry ? "Retrying on out of capacity" : "Available change in " + jobType.jobName()), nextJobs, false); - } - } - applications().store(application); - } - } - /** * Trigger a job for an application, if allowed * @@ -226,8 +167,7 @@ public class DeploymentTrigger { application = application.withChange(change); if (change.application().isPresent()) application = application.withOutstandingChange(Change.empty()); - // TODO jvenstad: Don't trigger. - application = trigger(new Triggering(application, JobType.systemTest, false, change.toString()), Collections.emptySet(), false); + applications().store(application); }); } @@ -250,22 +190,47 @@ public class DeploymentTrigger { //--- End of methods which triggers deployment jobs ---------------------------- - private ApplicationController applications() { return controller.applications(); } + /** Find the next step to trigger if any, and triggers it */ + private void triggerReadyJobs(LockedApplication application) { + List<JobType> jobs = order.jobsFrom(application.deploymentSpec()); - /** Retry immediately only if this job just started failing. Otherwise retry periodically */ - private boolean retryBecauseNewFailure(Application application, JobType jobType) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - return (jobStatus != null && jobStatus.firstFailing().get().at().isAfter(clock.instant().minus(Duration.ofSeconds(10)))); - } + // Should the first step be triggered? + if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) { + JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); + if (application.change().platform().isPresent()) { + Version target = application.change().platform().get(); + if (systemTestStatus == null + || ! systemTestStatus.lastTriggered().isPresent() + || ! systemTestStatus.isSuccess() + || ! systemTestStatus.lastTriggered().get().version().equals(target) + || systemTestStatus.isHanging(jobTimeoutLimit())) { + application = trigger(new Triggering(application, JobType.systemTest, false, "Upgrade to " + target), Collections.emptySet(), false); + applications().store(application); + } + } + } + + // Find next steps to trigger based on the state of the previous step + for (JobType jobType : (Iterable<JobType>) Stream.concat(Stream.of(JobType.component), jobs.stream())::iterator) { + JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); + if (jobStatus == null) continue; // job has never run - /** Decide whether to retry due to capacity restrictions */ - private boolean retryBecauseOutOfCapacity(Application application, JobType jobType) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (jobStatus == null || ! jobStatus.jobError().equals(Optional.of(JobError.outOfCapacity))) return false; - // Retry the job if it failed recently - return jobStatus.firstFailing().get().at().isAfter(clock.instant().minus(Duration.ofMinutes(15))); + // Collect the subset of next jobs which have not run with the last changes + // TODO jvenstad: Change to be step-centric. + List<JobType> nextJobs = order.nextAfter(jobType, application); + for (JobType nextJobType : nextJobs) { + JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType); + if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit())) { + boolean isRetry = nextStatus != null && nextStatus.jobError().filter(JobError.outOfCapacity::equals).isPresent(); + application = trigger(new Triggering(application, nextJobType, isRetry, isRetry ? "Retrying on out of capacity" : "Available change in " + jobType.jobName()), nextJobs, false); + } + } + applications().store(application); + } } + private ApplicationController applications() { return controller.applications(); } + /** Returns whether the given job type should be triggered according to deployment spec */ private boolean hasJob(JobType jobType, Application application) { if ( ! jobType.isProduction()) return true; // Deployment spec only determines this for production jobs. |