diff options
author | Martin Polden <mpolden@mpolden.no> | 2017-09-29 11:35:00 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2017-09-29 14:21:32 +0200 |
commit | b9dab0a9e5e75bb585a9cb73696c816d1a19d27f (patch) | |
tree | cec31570f3b3bda849a5fef9e7277d1691bedf6c /controller-server | |
parent | df59841596f54ac9a7493edb61ab3166673af4c7 (diff) |
Always retry failing deployment from first problematic job
Diffstat (limited to 'controller-server')
4 files changed, 57 insertions, 159 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index d2af5fc50cb..c43225fbd2b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.application.Change; +import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobError; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobReport; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; @@ -31,6 +32,7 @@ import java.util.logging.Logger; * This class is multithread safe. * * @author bratseth + * @author mpolden */ public class DeploymentTrigger { @@ -97,27 +99,30 @@ public class DeploymentTrigger { /** * Called periodically to cause triggering of jobs in the background */ - public void triggerFailing(ApplicationId applicationId, String cause) { + public void triggerFailing(ApplicationId applicationId, Duration timeout) { try (Lock lock = applications().lock(applicationId)) { Application application = applications().require(applicationId); - if (shouldRetryFromBeginning(application)) { - // failed for a long time: Discard existing change and restart from the component job - application = application.withDeploying(Optional.empty()); - application = trigger(JobType.component, application, false, "Retrying failing deployment from beginning: " + cause, lock); - applications().store(application, lock); - } else { - // retry the failed job (with backoff) - for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { // retry the *first* failing job - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (isFailing(jobStatus)) { - if (shouldRetryNow(jobStatus)) { - application = trigger(jobType, application, false, "Retrying failing job: " + cause, lock); - applications().store(application, lock); - } - break; + if (!application.deploying().isPresent()) { // No ongoing change, no need to retry + return; + } + // Retry first failing job + for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { + JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); + if (isFailing(application.deploying().get(), jobStatus)) { + if (shouldRetryNow(jobStatus)) { + application = trigger(jobType, application, false, "Retrying failing job", lock); + applications().store(application, lock); } + break; } } + // Retry dead job + Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), timeout); + if (firstDeadJob.isPresent()) { + application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job", + lock); + applications().store(application, lock); + } } } @@ -184,24 +189,26 @@ public class DeploymentTrigger { //--- End of methods which triggers deployment jobs ---------------------------- private ApplicationController applications() { return controller.applications(); } - - private boolean isFailing(JobStatus jobStatusOrNull) { - return jobStatusOrNull != null && !jobStatusOrNull.isSuccess(); + + /** Returns whether a job is failing for the current change in the given application */ + private boolean isFailing(Change change, JobStatus status) { + return status != null && + !status.isSuccess() && + status.lastCompletedFor(change); } private boolean isCapacityConstrained(JobType jobType) { return jobType == JobType.stagingTest || jobType == JobType.systemTest; } - private boolean shouldRetryFromBeginning(Application application) { - Instant eightHoursAgo = clock.instant().minus(Duration.ofHours(8)); - Instant failingSince = application.deploymentJobs().failingSince(); - if (failingSince != null && failingSince.isAfter(eightHoursAgo)) return false; - - JobStatus componentJobStatus = application.deploymentJobs().jobStatus().get(JobType.component); - if (componentJobStatus == null) return true; - if ( ! componentJobStatus.lastCompleted().isPresent() ) return true; - return componentJobStatus.lastCompleted().get().at().isBefore(eightHoursAgo); + /** Returns the first job that has been running for more than the given timeout */ + private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs, Duration timeout) { + Instant startOfGracePeriod = controller.clock().instant().minus(timeout); + Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream() + .filter(JobStatus::inProgress) + .sorted(Comparator.comparing(status -> status.lastTriggered().get().at())) + .findFirst(); + return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(startOfGracePeriod)); } /** Decide whether the job should be triggered by the periodic trigger */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java index 43c1755af34..c8831bd6a56 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java @@ -4,13 +4,9 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.application.ApplicationList; -import com.yahoo.vespa.hosted.controller.application.JobStatus; import java.time.Duration; -import java.time.Instant; -import java.util.Comparator; import java.util.List; -import java.util.Optional; /** * Attempts redeployment of failed jobs and deployments. @@ -31,57 +27,11 @@ public class FailureRedeployer extends Maintainer { List<Application> applications = ApplicationList.from(controller().applications().asList()) .notPullRequest() .asList(); - retryFailingJobs(applications); - retryStuckJobs(applications); + applications.forEach(application -> triggerFailing(application, jobTimeout)); } - private void retryFailingJobs(List<Application> applications) { - for (Application application : applications) { - if (!application.deploying().isPresent()) { - continue; - } - if (application.deploymentJobs().inProgress()) { - continue; - } - Optional<JobStatus> failingJob = jobFailingFor(application); - failingJob.ifPresent(job -> triggerFailing(application, "Job " + job.type().id() + - " has been failing since " + job.firstFailing().get())); - } - } - - private void retryStuckJobs(List<Application> applications) { - Instant startOfGracePeriod = controller().clock().instant().minus(jobTimeout); - for (Application application : applications) { - Optional<JobStatus> job = oldestRunningJob(application); - if (!job.isPresent()) { - continue; - } - // Ignore job if it doesn't belong to a zone in this system - if (!job.get().type().zone(controller().system()).isPresent()) { - continue; - } - if (job.get().lastTriggered().get().at().isBefore(startOfGracePeriod)) { - triggerFailing(application, "Job " + job.get().type().id() + - " has been running for more than " + jobTimeout); - } - } - } - - private Optional<JobStatus> jobFailingFor(Application application) { - return application.deploymentJobs().jobStatus().values().stream() - .filter(status -> !status.isSuccess() && status.lastCompletedFor(application.deploying().get())) - .findFirst(); - } - - private Optional<JobStatus> oldestRunningJob(Application application) { - return application.deploymentJobs().jobStatus().values().stream() - .filter(JobStatus::inProgress) - .sorted(Comparator.comparing(status -> status.lastTriggered().get().at())) - .findFirst(); - } - - private void triggerFailing(Application application, String cause) { - controller().applications().deploymentTrigger().triggerFailing(application.id(), cause); + private void triggerFailing(Application application, Duration timeout) { + controller().applications().deploymentTrigger().triggerFailing(application.id(), timeout); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 0c0cc0485c8..5a48bc54b49 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -47,22 +47,29 @@ public class DeploymentTriggerTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); + // system-test fails and is retried tester.deployAndNotify(app, applicationPackage, false, JobType.systemTest); assertEquals("Retried immediately", 1, tester.buildSystem().jobs().size()); - tester.buildSystem().takeJobsToRun(); assertEquals("Job removed", 0, tester.buildSystem().jobs().size()); - tester.clock().advance(Duration.ofHours(2)); + tester.clock().advance(Duration.ofHours(4).plus(Duration.ofSeconds(1))); tester.failureRedeployer().maintain(); + assertEquals("Retried job", 1, tester.buildSystem().jobs().size()); assertEquals(JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); - tester.buildSystem().takeJobsToRun(); assertEquals("Job removed", 0, tester.buildSystem().jobs().size()); + + // system-test succeeds and staging-test starts + tester.failureRedeployer().maintain(); + tester.deployAndNotify(app, applicationPackage, true, JobType.systemTest); + + // staging-test times out and is retried + tester.buildSystem().takeJobsToRun(); tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); tester.failureRedeployer().maintain(); - assertEquals("Retried from the beginning", 1, tester.buildSystem().jobs().size()); - assertEquals(JobType.component.id(), tester.buildSystem().jobs().get(0).jobName()); + assertEquals("Retried dead job", 1, tester.buildSystem().jobs().size()); + assertEquals(JobType.stagingTest.id(), tester.buildSystem().jobs().get(0).jobName()); } @Test diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java index 2d1b7463dea..286db864c22 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java @@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer; -import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.nio.file.Files; @@ -116,81 +115,16 @@ public class FailureRedeployerTest { tester.failureRedeployer().maintain(); assertTrue("No jobs retried", tester.buildSystem().jobs().isEmpty()); - // Just over 12 hours pass, deployment is retried from beginning + // Just over 12 hours pass, job is retried tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); tester.failureRedeployer().maintain(); - assertEquals(DeploymentJobs.JobType.component.id(), tester.buildSystem().takeJobsToRun().get(0).jobName()); - - // Ensure that system-test is triggered after component. Triggering component records a new change, but in this - // case there's already a change in progress which we want to discard and start over - tester.notifyJobCompletion(DeploymentJobs.JobType.component, app, true); - assertEquals(DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); - } - - @Test - public void testAlwaysRestartsDeploymentOfApplicationsWithStuckJobs() { - DeploymentTester tester = new DeploymentTester(); - Version version = Version.fromString("5.0"); - tester.updateVersionStatus(version); - - ApplicationPackage applicationPackage = new ApplicationPackageBuilder() - .environment(Environment.prod) - .region("us-west-1") - .build(); - - // Setup applications - Application canary0 = tester.createAndDeploy("canary0", 0, "canary"); - Application canary1 = tester.createAndDeploy("canary1", 1, "canary"); - Application default0 = tester.createAndDeploy("default0", 2, "default"); - Application default1 = tester.createAndDeploy("default1", 3, "default"); - Application default2 = tester.createAndDeploy("default2", 4, "default"); - Application default3 = tester.createAndDeploy("default3", 5, "default"); - Application default4 = tester.createAndDeploy("default4", 6, "default"); - - // New version is released - version = Version.fromString("5.1"); - tester.updateVersionStatus(version); - assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); - tester.upgrader().maintain(); - - // Canaries upgrade and raise confidence - tester.completeUpgrade(canary0, version, "canary"); - tester.completeUpgrade(canary1, version, "canary"); - tester.updateVersionStatus(version); - assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); - - // Applications with default policy start upgrading - tester.clock().advance(Duration.ofMinutes(1)); - tester.upgrader().maintain(); - assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); - - // 4/5 applications fail, confidence is lowered and upgrade is cancelled - tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); - tester.updateVersionStatus(version); - assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); - tester.upgrader().maintain(); - - // 5th app never reports back and has a dead locked job, but no ongoing change - Application deadLocked = tester.applications().require(default4.id()); - assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); - assertFalse("No change present", deadLocked.deploying().isPresent()); - - // 4/5 applications are repaired and confidence is restored - tester.deployCompletely(default0, applicationPackage); - tester.deployCompletely(default1, applicationPackage); - tester.deployCompletely(default2, applicationPackage); - tester.deployCompletely(default3, applicationPackage); - tester.updateVersionStatus(version); - assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + assertEquals(DeploymentJobs.JobType.stagingTest.id(), tester.buildSystem().takeJobsToRun().get(0).jobName()); - // Over 12 hours pass and failure redeployer restarts deployment of 5th app - tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); - tester.failureRedeployer().maintain(); - assertEquals("Deployment is restarted", DeploymentJobs.JobType.component.id(), - tester.buildSystem().jobs().get(0).jobName()); + // Deployment completes + tester.deploy(DeploymentJobs.JobType.stagingTest, app, applicationPackage, true); + tester.notifyJobCompletion(DeploymentJobs.JobType.stagingTest, app, true); + tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3); + assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty()); } @Test |