diff options
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
4 files changed, 62 insertions, 53 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
index 3fcd285e0fc..fa7a48c85c2 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
@@ -80,21 +80,11 @@ public class ApplicationList {
         return listOf(list.stream().filter(application -> ! failingOn(version, application)));
     }
 
-    /** Returns the subset of applications which have one or more deployment jobs failing for the current change */
-    public ApplicationList hasDeploymentFailures() {
-        return listOf(list.stream().filter(application -> application.deploying().isPresent() && application.deploymentJobs().failingOn(application.deploying().get())));
-    }
-
     /** Returns the subset of applications which have at least one deployment */
     public ApplicationList hasDeployment() {
         return listOf(list.stream().filter(a -> !a.deployments().isEmpty()));
     }
 
-    /** Returns the subset of applications that are currently deploying a change */
-    public ApplicationList isDeploying() {
-        return listOf(list.stream().filter(application -> application.deploying().isPresent()));
-    }
-
     /** Returns the subset of applications which started failing after the given instant */
     public ApplicationList startedFailingAfter(Instant instant) {
         return listOf(list.stream().filter(application -> application.deploymentJobs().failingSince().isAfter(instant)));
@@ -140,18 +130,6 @@ public class ApplicationList {
         return listOf(list.stream().filter(a -> !hasRunningJob(a, change)));
     }
 
-    /** Returns the subset of applications which currently do not have any job in progress */
-    public ApplicationList notRunningJob() {
-        return listOf(list.stream().filter(a -> !a.deploymentJobs().inProgress()));
-    }
-
-    /** Returns the subset of applications which has a job that started running before the given instant */
-    public ApplicationList jobRunningSince(Instant instant) {
-        return listOf(list.stream().filter(a -> a.deploymentJobs().runningSince()
-                                                 .map(at -> at.isBefore(instant))
-                                                 .orElse(false)));
-    }
-
     /** Returns the subset of applications which deploys to given environment and region */
     public ApplicationList deploysTo(Environment environment, RegionName region) {
         return listOf(list.stream().filter(a -> a.deploymentSpec().includes(environment, Optional.of(region))));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
index d9256f94086..d775dd2a356 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
@@ -14,7 +14,6 @@ import com.yahoo.vespa.hosted.controller.Controller;
 import java.time.Instant;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -118,11 +117,6 @@ public class DeploymentJobs {
         return status.values().stream().anyMatch(JobStatus::inProgress);
     }
 
-    /** Returns whether any job is failing for the given change */
-    public boolean failingOn(Change change) {
-        return status.values().stream().anyMatch(jobStatus -> !jobStatus.isSuccess() && jobStatus.lastCompletedFor(change));
-    }
-
     /** Returns whether change can be deployed to the given environment */
     public boolean isDeployableTo(Environment environment, Optional<Change> change) {
         if (environment == null || !change.isPresent()) {
@@ -147,15 +141,6 @@ public class DeploymentJobs {
         return failingSince;
     }
 
-    /** Returns the time at which the oldest running job started */
-    public Optional<Instant> runningSince() {
-        return jobStatus().values().stream()
-                .filter(JobStatus::inProgress)
-                .sorted(Comparator.comparing(jobStatus -> jobStatus.lastTriggered().get().at()))
-                .map(jobStatus -> jobStatus.lastTriggered().get().at())
-                .findFirst();
-    }
-
     /**
      * Returns the id of the Screwdriver project running these deployment jobs
      * - or empty when this is not known or does not exist.
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 2bc219dde62..ac84f3685ca 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -89,13 +89,13 @@ public class DeploymentTrigger {
     /**
      * Called periodically to cause triggering of jobs in the background
      */
-    public void triggerFailing(ApplicationId applicationId) {
+    public void triggerFailing(ApplicationId applicationId, String cause) {
         try (Lock lock = applications().lock(applicationId)) {
             Application application = applications().require(applicationId);
             if (shouldRetryFromBeginning(application)) {
                 // failed for a long time: Discard existing change and restart from the component job
                 application = application.withDeploying(Optional.empty());
-                application = trigger(JobType.component, application, "Retrying failing deployment from beginning", lock);
+                application = trigger(JobType.component, application, "Retrying failing deployment from beginning: " + cause, lock);
                 applications().store(application, lock);
             } else {
                 // retry the failed job (with backoff)
@@ -103,7 +103,7 @@ public class DeploymentTrigger {
                     JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
                     if (isFailing(jobStatus)) {
                         if (shouldRetryNow(jobStatus)) {
-                            application = trigger(jobType, application, "Retrying failing job", lock);
+                            application = trigger(jobType, application, "Retrying failing job: " + cause, lock);
                             applications().store(application, lock);
                         }
                         break;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
index 9e8f902a8db..38d4a4a8a81 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
@@ -3,12 +3,15 @@ package com.yahoo.vespa.hosted.controller.maintenance;
 
 import com.yahoo.vespa.hosted.controller.Application;
 import com.yahoo.vespa.hosted.controller.Controller;
-import com.yahoo.vespa.hosted.controller.application.ApplicationList;
+import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType;
+import com.yahoo.vespa.hosted.controller.application.JobStatus;
 
 import java.time.Duration;
 import java.time.Instant;
-import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.List;
+import java.util.Map;
+import java.util.Optional;
 
 /**
  * Attempts redeployment of failed jobs and deployments.
@@ -16,6 +19,8 @@ import java.util.List;
  * @author bratseth
  */
 public class FailureRedeployer extends Maintainer {
+
+    private final static Duration jobTimeout = Duration.ofHours(12);
 
     public FailureRedeployer(Controller controller, Duration interval, JobControl jobControl) {
         super(controller, interval, jobControl);
@@ -23,20 +28,61 @@ public class FailureRedeployer extends Maintainer {
 
     @Override
     public void maintain() {
-        ApplicationList applications = ApplicationList.from(controller().applications().asList()).isDeploying();
-        List<Application> toTrigger = new ArrayList<>();
+        List<Application> applications = controller().applications().asList();
+        retryFailingJobs(applications);
+        retryStuckJobs(applications);
+    }
+
+    private void retryFailingJobs(List<Application> applications) {
+        for (Application application : applications) {
+            if (!application.deploying().isPresent()) {
+                continue;
+            }
+            if (application.deploymentJobs().inProgress()) {
+                continue;
+            }
+            Optional<Map.Entry<JobType, JobStatus>> failingJob = jobFailingFor(application);
+            failingJob.ifPresent(job -> triggerFailing(application, "Job " + job.getKey().id() +
+                                                                    " has been failing since " + job.getValue().lastCompleted().get()));
+        }
+    }
 
-        // Applications with deployment failures for current change and no running jobs
-        toTrigger.addAll(applications.hasDeploymentFailures()
-                                     .notRunningJob()
-                                     .asList());
+    private void retryStuckJobs(List<Application> applications) {
+        Instant maxAge = controller().clock().instant().minus(jobTimeout);
+        for (Application application : applications) {
+            if (!application.deploying().isPresent()) {
+                continue;
+            }
+            Optional<Map.Entry<JobType, JobStatus>> job = oldestRunningJob(application);
+            if (!job.isPresent()) {
+                continue;
+            }
+            // Ignore job if it doesn't belong to a zone in this system
+            if (!job.get().getKey().zone(controller().system()).isPresent()) {
+                continue;
+            }
+            if (job.get().getValue().lastTriggered().get().at().isBefore(maxAge)) {
+                triggerFailing(application, "Job " + job.get().getKey().id() +
+                                            " has been running for more than " + jobTimeout);
+            }
+        }
+    }
+
+    private Optional<Map.Entry<JobType, JobStatus>> jobFailingFor(Application application) {
+        return application.deploymentJobs().jobStatus().entrySet().stream()
+                          .filter(e -> !e.getValue().isSuccess() && e.getValue().lastCompletedFor(application.deploying().get()))
+                          .findFirst();
+    }
 
-        // Applications with jobs that have been in progress for more than 12 hours
-        Instant twelveHoursAgo = controller().clock().instant().minus(Duration.ofHours(12));
-        toTrigger.addAll(applications.jobRunningSince(twelveHoursAgo).asList());
+    private Optional<Map.Entry<JobType, JobStatus>> oldestRunningJob(Application application) {
+        return application.deploymentJobs().jobStatus().entrySet().stream()
+                          .filter(kv -> kv.getValue().inProgress())
+                          .sorted(Comparator.comparing(kv -> kv.getValue().lastTriggered().get().at()))
+                          .findFirst();
+    }
 
-        toTrigger.forEach(application -> controller().applications().deploymentTrigger()
-                                                     .triggerFailing(application.id()));
+    private void triggerFailing(Application application, String cause) {
+        controller().applications().deploymentTrigger().triggerFailing(application.id(), cause);
     }
 
 }