summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2017-12-18 16:22:58 +0100
committerJon Bratseth <bratseth@yahoo-inc.com>2017-12-18 16:22:58 +0100
commit69152e64fc8e1091ffde2493fc91e7e71371200e (patch)
tree3f43e53da6a47ff888b83ec6a2eeb31839750d6d /controller-server
parent8aa5c3fa34ae7dcc152b4709343542fd499f21d0 (diff)
Always retry hanging jobs
Hanging jobs will usually be implicitly retried, since they will not be taken as running. However, they will not be retried if the outcome of completing the job will be unchanged. Such jobs still need to complete, to trigger the next step, such that we can eventually trigger the downstream jobs which need to run to complete the ongoing change (arguably we could skip the job in these cases, but currently we don't).
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java43
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java33
2 files changed, 41 insertions, 35 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java
index ceb04d88026..a7940076277 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java
@@ -11,14 +11,14 @@ import java.util.Optional;
/**
* The last known build status of a particular deployment job for a particular application.
* This is immutable.
- *
+ *
* @author bratseth
* @author mpolden
*/
public class JobStatus {
-
+
private final DeploymentJobs.JobType type;
-
+
private final Optional<JobRun> lastTriggered;
private final Optional<JobRun> lastCompleted;
private final Optional<JobRun> firstFailing;
@@ -42,7 +42,7 @@ public class JobStatus {
this.type = type;
this.jobError = jobError;
-
+
// Never say we triggered component because we don't:
this.lastTriggered = type == DeploymentJobs.JobType.component ? Optional.empty() : lastTriggered;
this.lastCompleted = lastCompleted;
@@ -52,7 +52,7 @@ public class JobStatus {
/** Returns an empty job status */
public static JobStatus initial(DeploymentJobs.JobType type) {
- return new JobStatus(type, Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty());
+ return new JobStatus(type, Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty());
}
public JobStatus withTriggering(Version version, Optional<ApplicationRevision> revision,
@@ -89,13 +89,13 @@ public class JobStatus {
Optional<JobRun> firstFailing = this.firstFailing;
if (jobError.isPresent() && ! this.firstFailing.isPresent())
firstFailing = Optional.of(thisCompletion);
-
+
Optional<JobRun> lastSuccess = this.lastSuccess;
if ( ! jobError.isPresent()) {
lastSuccess = Optional.of(thisCompletion);
firstFailing = Optional.empty();
}
-
+
return new JobStatus(type, jobError, lastTriggered, Optional.of(thisCompletion), firstFailing, lastSuccess);
}
@@ -105,7 +105,7 @@ public class JobStatus {
public boolean isSuccess() {
return lastCompleted().isPresent() && ! jobError.isPresent();
}
-
+
/** Returns true if last triggered is newer than last completed and was started after timeoutLimit */
public boolean isRunning(Instant timeoutLimit) {
if ( ! lastTriggered.isPresent()) return false;
@@ -114,6 +114,11 @@ public class JobStatus {
return ! lastTriggered.get().at().isBefore(lastCompleted.get().at());
}
+ /** Returns true if this is running and has been so since before the given limit */
+ public boolean isHanging(Instant timeoutLimit) {
+ return isRunning(Instant.MIN) && lastTriggered.get().at().isBefore(timeoutLimit.plusMillis(1));
+ }
+
/** The error of the last completion, or empty if the last run succeeded */
public Optional<DeploymentJobs.JobError> jobError() { return jobError; }
@@ -140,10 +145,10 @@ public class JobStatus {
", first failing: " + firstFailing.map(JobRun::toString).orElse("(not failing)") +
", lastSuccess: " + lastSuccess.map(JobRun::toString).orElse("(never)") + "]";
}
-
+
@Override
public int hashCode() { return Objects.hash(type, jobError, lastTriggered, lastCompleted, firstFailing, lastSuccess); }
-
+
@Override
public boolean equals(Object o) {
if (o == this) return true;
@@ -159,15 +164,15 @@ public class JobStatus {
/** Information about a particular triggering or completion of a run of a job. This is immutable. */
public static class JobRun {
-
+
private final long id;
private final Version version;
private final Optional<ApplicationRevision> revision;
private final boolean upgrade;
private final String reason;
private final Instant at;
-
- public JobRun(long id, Version version, Optional<ApplicationRevision> revision,
+
+ public JobRun(long id, Version version, Optional<ApplicationRevision> revision,
boolean upgrade, String reason, Instant at) {
Objects.requireNonNull(version, "version cannot be null");
Objects.requireNonNull(revision, "revision cannot be null");
@@ -188,16 +193,16 @@ public class JobStatus {
// TODO: Fix how this is set, and add an applicationChange() method as well, in the same vein.
/** Returns whether this job run was a Vespa upgrade */
public boolean upgrade() { return upgrade; }
-
+
/** Returns the Vespa version used on this run */
public Version version() { return version; }
-
+
/** Returns the application revision used for this run, or empty when not known */
public Optional<ApplicationRevision> revision() { return revision; }
-
+
/** Returns a human-readable reason for this particular job run */
public String reason() { return reason; }
-
+
/** Returns the time if this triggering or completion */
public Instant at() { return at; }
@@ -218,7 +223,7 @@ public class JobStatus {
public int hashCode() {
return Objects.hash(version, revision, upgrade, at);
}
-
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
@@ -234,7 +239,7 @@ public class JobStatus {
@Override
public String toString() { return "job run " + id + " of version " + (upgrade() ? "upgrade " : "") + version + " "
+ revision + " at " + at; }
-
+
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index f0c950b024b..192901165be 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -32,9 +32,9 @@ import java.util.logging.Logger;
/**
* Responsible for scheduling deployment jobs in a build system and keeping
* Application.deploying() in sync with what is scheduled.
- *
+ *
* This class is multithread safe.
- *
+ *
* @author bratseth
* @author mpolden
*/
@@ -60,7 +60,7 @@ public class DeploymentTrigger {
this.order = new DeploymentOrder(controller);
this.jobTimeout = controller.system().equals(SystemName.main) ? Duration.ofHours(12) : Duration.ofHours(1);
}
-
+
/** Returns the time in the past before which jobs are at this moment considered unresponsive */
public Instant jobTimeoutLimit() { return clock.instant().minus(jobTimeout); }
@@ -70,10 +70,10 @@ public class DeploymentTrigger {
//--- Start of methods which triggers deployment jobs -------------------------
- /**
+ /**
* Called each time a job completes (successfully or not) to cause triggering of one or more follow-up jobs
* (which may possibly the same job once over).
- *
+ *
* @param report information about the job that just completed
*/
public void triggerFromCompletion(JobReport report) {
@@ -143,10 +143,11 @@ public class DeploymentTrigger {
JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest);
if (application.deploying().get() instanceof Change.VersionChange) {
Version target = ((Change.VersionChange) application.deploying().get()).version();
- if (systemTestStatus == null
+ if (systemTestStatus == null
|| ! systemTestStatus.lastTriggered().isPresent()
|| ! systemTestStatus.isSuccess()
- || ! systemTestStatus.lastTriggered().get().version().equals(target)) {
+ || ! systemTestStatus.lastTriggered().get().version().equals(target)
+ || systemTestStatus.isHanging(jobTimeoutLimit())) {
application = trigger(JobType.systemTest, application, false, "Upgrade to " + target);
controller.applications().store(application);
}
@@ -170,7 +171,7 @@ public class DeploymentTrigger {
List<JobType> nextToTrigger = new ArrayList<>();
for (JobType nextJobType : order.nextAfter(jobType, application)) {
JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType);
- if (changesAvailable(application, jobStatus, nextStatus))
+ if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit()))
nextToTrigger.add(nextJobType);
}
// Trigger them in parallel
@@ -209,10 +210,10 @@ public class DeploymentTrigger {
return true;
return false;
}
-
+
/**
* Triggers a change of this application
- *
+ *
* @param applicationId the application to trigger
* @throws IllegalArgumentException if this application already have an ongoing change
*/
@@ -267,7 +268,7 @@ public class DeploymentTrigger {
}
/**
- * Trigger a job for an application
+ * Trigger a job for an application
*
* @param jobType the type of the job to trigger, or null to trigger nothing
* @param application the application to trigger the job for
@@ -289,7 +290,7 @@ public class DeploymentTrigger {
/**
* Trigger a job for an application, if allowed
- *
+ *
* @param jobType the type of the job to trigger, or null to trigger nothing
* @param application the application to trigger the job for
* @param first whether to trigger the job before other jobs
@@ -323,7 +324,7 @@ public class DeploymentTrigger {
/** Returns true if the given proposed job triggering should be effected */
private boolean allowedTriggering(JobType jobType, LockedApplication application) {
- // Note: We could make a more fine-grained and more correct determination about whether to block
+ // Note: We could make a more fine-grained and more correct determination about whether to block
// by instead basing the decision on what is currently deployed in the zone. However,
// this leads to some additional corner cases, and the possibility of blocking an application
// fix to a version upgrade, so not doing it now
@@ -341,7 +342,7 @@ public class DeploymentTrigger {
return true;
}
-
+
private boolean isRunningProductionJob(Application application) {
return JobList.from(application)
.production()
@@ -364,7 +365,7 @@ public class DeploymentTrigger {
if (existingDeployment == null) return false;
return existingDeployment.version().isAfter(version);
}
-
+
private boolean acceptNewRevisionNow(LockedApplication application) {
if ( ! application.deploying().isPresent()) return true;
@@ -377,5 +378,5 @@ public class DeploymentTrigger {
// Otherwise, the application is currently upgrading, without failures, and we should wait with the revision.
return false;
}
-
+
}