summary | refs | log | tree | commit | diff | stats
path: root/controller-server/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java | 22
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java | 15
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java | 6
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java | 72
4 files changed, 62 insertions, 53 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
index 3fcd285e0fc..fa7a48c85c2 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
@@ -80,21 +80,11 @@ public class ApplicationList {
return listOf(list.stream().filter(application -> ! failingOn(version, application)));
}
- /** Returns the subset of applications which have one or more deployment jobs failing for the current change */
- public ApplicationList hasDeploymentFailures() {
- return listOf(list.stream().filter(application -> application.deploying().isPresent() && application.deploymentJobs().failingOn(application.deploying().get())));
- }
-
/** Returns the subset of applications which have at least one deployment */
public ApplicationList hasDeployment() {
return listOf(list.stream().filter(a -> !a.deployments().isEmpty()));
}
- /** Returns the subset of applications that are currently deploying a change */
- public ApplicationList isDeploying() {
- return listOf(list.stream().filter(application -> application.deploying().isPresent()));
- }
-
/** Returns the subset of applications which started failing after the given instant */
public ApplicationList startedFailingAfter(Instant instant) {
return listOf(list.stream().filter(application -> application.deploymentJobs().failingSince().isAfter(instant)));
@@ -140,18 +130,6 @@ public class ApplicationList {
return listOf(list.stream().filter(a -> !hasRunningJob(a, change)));
}
- /** Returns the subset of applications which currently do not have any job in progress */
- public ApplicationList notRunningJob() {
- return listOf(list.stream().filter(a -> !a.deploymentJobs().inProgress()));
- }
-
- /** Returns the subset of applications which has a job that started running before the given instant */
- public ApplicationList jobRunningSince(Instant instant) {
- return listOf(list.stream().filter(a -> a.deploymentJobs().runningSince()
- .map(at -> at.isBefore(instant))
- .orElse(false)));
- }
-
/** Returns the subset of applications which deploys to given environment and region */
public ApplicationList deploysTo(Environment environment, RegionName region) {
return listOf(list.stream().filter(a -> a.deploymentSpec().includes(environment, Optional.of(region))));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
index d9256f94086..d775dd2a356 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
@@ -14,7 +14,6 @@ import com.yahoo.vespa.hosted.controller.Controller;
import java.time.Instant;
import java.util.Collection;
import java.util.Collections;
-import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
@@ -118,11 +117,6 @@ public class DeploymentJobs {
return status.values().stream().anyMatch(JobStatus::inProgress);
}
- /** Returns whether any job is failing for the given change */
- public boolean failingOn(Change change) {
- return status.values().stream().anyMatch(jobStatus -> !jobStatus.isSuccess() && jobStatus.lastCompletedFor(change));
- }
-
/** Returns whether change can be deployed to the given environment */
public boolean isDeployableTo(Environment environment, Optional<Change> change) {
if (environment == null || !change.isPresent()) {
@@ -147,15 +141,6 @@ public class DeploymentJobs {
return failingSince;
}
- /** Returns the time at which the oldest running job started */
- public Optional<Instant> runningSince() {
- return jobStatus().values().stream()
- .filter(JobStatus::inProgress)
- .sorted(Comparator.comparing(jobStatus -> jobStatus.lastTriggered().get().at()))
- .map(jobStatus -> jobStatus.lastTriggered().get().at())
- .findFirst();
- }
-
/**
* Returns the id of the Screwdriver project running these deployment jobs
* - or empty when this is not known or does not exist.
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 2bc219dde62..ac84f3685ca 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -89,13 +89,13 @@ public class DeploymentTrigger {
/**
* Called periodically to cause triggering of jobs in the background
*/
- public void triggerFailing(ApplicationId applicationId) {
+ public void triggerFailing(ApplicationId applicationId, String cause) {
try (Lock lock = applications().lock(applicationId)) {
Application application = applications().require(applicationId);
if (shouldRetryFromBeginning(application)) {
// failed for a long time: Discard existing change and restart from the component job
application = application.withDeploying(Optional.empty());
- application = trigger(JobType.component, application, "Retrying failing deployment from beginning", lock);
+ application = trigger(JobType.component, application, "Retrying failing deployment from beginning: " + cause, lock);
applications().store(application, lock);
} else {
// retry the failed job (with backoff)
@@ -103,7 +103,7 @@ public class DeploymentTrigger {
JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
if (isFailing(jobStatus)) {
if (shouldRetryNow(jobStatus)) {
- application = trigger(jobType, application, "Retrying failing job", lock);
+ application = trigger(jobType, application, "Retrying failing job: " + cause, lock);
applications().store(application, lock);
}
break;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
index 9e8f902a8db..38d4a4a8a81 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
@@ -3,12 +3,15 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
-import com.yahoo.vespa.hosted.controller.application.ApplicationList;
+import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType;
+import com.yahoo.vespa.hosted.controller.application.JobStatus;
import java.time.Duration;
import java.time.Instant;
-import java.util.ArrayList;
+import java.util.Comparator;
import java.util.List;
+import java.util.Map;
+import java.util.Optional;
/**
* Attempts redeployment of failed jobs and deployments.
@@ -16,6 +19,8 @@ import java.util.List;
* @author bratseth
*/
public class FailureRedeployer extends Maintainer {
+
+ private final static Duration jobTimeout = Duration.ofHours(12);
public FailureRedeployer(Controller controller, Duration interval, JobControl jobControl) {
super(controller, interval, jobControl);
@@ -23,20 +28,61 @@ public class FailureRedeployer extends Maintainer {
@Override
public void maintain() {
- ApplicationList applications = ApplicationList.from(controller().applications().asList()).isDeploying();
- List<Application> toTrigger = new ArrayList<>();
+ List<Application> applications = controller().applications().asList();
+ retryFailingJobs(applications);
+ retryStuckJobs(applications);
+ }
+
+ private void retryFailingJobs(List<Application> applications) {
+ for (Application application : applications) {
+ if (!application.deploying().isPresent()) {
+ continue;
+ }
+ if (application.deploymentJobs().inProgress()) {
+ continue;
+ }
+ Optional<Map.Entry<JobType, JobStatus>> failingJob = jobFailingFor(application);
+ failingJob.ifPresent(job -> triggerFailing(application, "Job " + job.getKey().id() +
+ " has been failing since " + job.getValue().lastCompleted().get()));
+ }
+ }
- // Applications with deployment failures for current change and no running jobs
- toTrigger.addAll(applications.hasDeploymentFailures()
- .notRunningJob()
- .asList());
+ private void retryStuckJobs(List<Application> applications) {
+ Instant maxAge = controller().clock().instant().minus(jobTimeout);
+ for (Application application : applications) {
+ if (!application.deploying().isPresent()) {
+ continue;
+ }
+ Optional<Map.Entry<JobType, JobStatus>> job = oldestRunningJob(application);
+ if (!job.isPresent()) {
+ continue;
+ }
+ // Ignore job if it doesn't belong to a zone in this system
+ if (!job.get().getKey().zone(controller().system()).isPresent()) {
+ continue;
+ }
+ if (job.get().getValue().lastTriggered().get().at().isBefore(maxAge)) {
+ triggerFailing(application, "Job " + job.get().getKey().id() +
+ " has been running for more than " + jobTimeout);
+ }
+ }
+ }
+
+ private Optional<Map.Entry<JobType, JobStatus>> jobFailingFor(Application application) {
+ return application.deploymentJobs().jobStatus().entrySet().stream()
+ .filter(e -> !e.getValue().isSuccess() && e.getValue().lastCompletedFor(application.deploying().get()))
+ .findFirst();
+ }
- // Applications with jobs that have been in progress for more than 12 hours
- Instant twelveHoursAgo = controller().clock().instant().minus(Duration.ofHours(12));
- toTrigger.addAll(applications.jobRunningSince(twelveHoursAgo).asList());
+ private Optional<Map.Entry<JobType, JobStatus>> oldestRunningJob(Application application) {
+ return application.deploymentJobs().jobStatus().entrySet().stream()
+ .filter(kv -> kv.getValue().inProgress())
+ .sorted(Comparator.comparing(kv -> kv.getValue().lastTriggered().get().at()))
+ .findFirst();
+ }
- toTrigger.forEach(application -> controller().applications().deploymentTrigger()
- .triggerFailing(application.id()));
+ private void triggerFailing(Application application, String cause) {
+ controller().applications().deploymentTrigger().triggerFailing(application.id(), cause);
}
}