diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-10-05 14:32:10 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-10-05 14:32:10 +0200 |
commit | 4fb17280a8fd1532965f06a624b52511351c16ca (patch) | |
tree | 5772271ad62390214bf946b5e3bfe5a7e6d23af1 /controller-server | |
parent | 07368c537b93757ac8b36f35838eeb0d8ef3b507 (diff) |
Don't retry jobs that are running
Diffstat (limited to 'controller-server')
2 files changed, 8 insertions, 14 deletions
```diff
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 0268e187f35..9b2158e161e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -115,6 +115,7 @@ public class DeploymentTrigger {
                     break;
                 }
             }
+
             // Retry dead job
             Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), timeout);
             if (firstDeadJob.isPresent()) {
@@ -213,16 +214,15 @@ public class DeploymentTrigger {
     /** Decide whether the job should be triggered by the periodic trigger */
     private boolean shouldRetryNow(JobStatus job) {
         if (job.isSuccess()) return false;
+        if (job.inProgress()) return false;
 
-        if ( ! job.lastCompleted().isPresent()) return true; // Retry when we don't hear back
+        // Retry after 10% of the time since it started failing
+        Duration aTenthOfFailTime = Duration.ofMillis( (clock.millis() - job.firstFailing().get().at().toEpochMilli()) / 10);
+        if (job.lastCompleted().get().at().isBefore(clock.instant().minus(aTenthOfFailTime))) return true;
 
-        // Always retry if we haven't tried in 4 hours
+        // ... or retry anyway if we haven't tried in 4 hours
         if (job.lastCompleted().get().at().isBefore(clock.instant().minus(Duration.ofHours(4)))) return true;
 
-        // Wait for 10% of the time since it started failing
-        Duration aTenthOfFailTime = Duration.ofMillis( (clock.millis() - job.firstFailing().get().at().toEpochMilli()) / 10);
-        if (job.lastCompleted().get().at().isBefore(clock.instant().minus(aTenthOfFailTime))) return true;
-
         return false;
     }
 
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index 5a48bc54b49..5abadf28cfb 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -53,15 +53,9 @@ public class DeploymentTriggerTest {
         tester.buildSystem().takeJobsToRun();
         assertEquals("Job removed", 0, tester.buildSystem().jobs().size());
         tester.clock().advance(Duration.ofHours(4).plus(Duration.ofSeconds(1)));
-        tester.failureRedeployer().maintain();
+        tester.failureRedeployer().maintain(); // Causes retry of systemTests
 
-        assertEquals("Retried job", 1, tester.buildSystem().jobs().size());
-        assertEquals(JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName());
-        tester.buildSystem().takeJobsToRun();
-        assertEquals("Job removed", 0, tester.buildSystem().jobs().size());
-
-        // system-test succeeds and staging-test starts
-        tester.failureRedeployer().maintain();
+        assertEquals("Scheduled retry", 1, tester.buildSystem().jobs().size());
         tester.deployAndNotify(app, applicationPackage, true, JobType.systemTest);
 
         // staging-test times out and is retried
```