summary | refs | log | tree | commit | diff | stats
path: root/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
diff options
context:
space:
mode:
Diffstat (limited to 'controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java | 56
1 file changed, 3 insertions, 53 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
index 43c1755af34..c8831bd6a56 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
@@ -4,13 +4,9 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
-import com.yahoo.vespa.hosted.controller.application.JobStatus;
import java.time.Duration;
-import java.time.Instant;
-import java.util.Comparator;
import java.util.List;
-import java.util.Optional;
/**
* Attempts redeployment of failed jobs and deployments.
@@ -31,57 +27,11 @@ public class FailureRedeployer extends Maintainer {
List<Application> applications = ApplicationList.from(controller().applications().asList())
.notPullRequest()
.asList();
- retryFailingJobs(applications);
- retryStuckJobs(applications);
+ applications.forEach(application -> triggerFailing(application, jobTimeout));
}
- private void retryFailingJobs(List<Application> applications) {
- for (Application application : applications) {
- if (!application.deploying().isPresent()) {
- continue;
- }
- if (application.deploymentJobs().inProgress()) {
- continue;
- }
- Optional<JobStatus> failingJob = jobFailingFor(application);
- failingJob.ifPresent(job -> triggerFailing(application, "Job " + job.type().id() +
- " has been failing since " + job.firstFailing().get()));
- }
- }
-
- private void retryStuckJobs(List<Application> applications) {
- Instant startOfGracePeriod = controller().clock().instant().minus(jobTimeout);
- for (Application application : applications) {
- Optional<JobStatus> job = oldestRunningJob(application);
- if (!job.isPresent()) {
- continue;
- }
- // Ignore job if it doesn't belong to a zone in this system
- if (!job.get().type().zone(controller().system()).isPresent()) {
- continue;
- }
- if (job.get().lastTriggered().get().at().isBefore(startOfGracePeriod)) {
- triggerFailing(application, "Job " + job.get().type().id() +
- " has been running for more than " + jobTimeout);
- }
- }
- }
-
- private Optional<JobStatus> jobFailingFor(Application application) {
- return application.deploymentJobs().jobStatus().values().stream()
- .filter(status -> !status.isSuccess() && status.lastCompletedFor(application.deploying().get()))
- .findFirst();
- }
-
- private Optional<JobStatus> oldestRunningJob(Application application) {
- return application.deploymentJobs().jobStatus().values().stream()
- .filter(JobStatus::inProgress)
- .sorted(Comparator.comparing(status -> status.lastTriggered().get().at()))
- .findFirst();
- }
-
- private void triggerFailing(Application application, String cause) {
- controller().applications().deploymentTrigger().triggerFailing(application.id(), cause);
+ private void triggerFailing(Application application, Duration timeout) {
+ controller().applications().deploymentTrigger().triggerFailing(application.id(), timeout);
}
}