diff options
author | Harald Musum <musum@yahoo-inc.com> | 2017-09-29 18:35:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-09-29 18:35:32 +0200 |
commit | 61679bfc56d0f9c5df09e92b70375ec1262179f0 (patch) | |
tree | 7e083745a8e04c6621073fede79899552165dbf8 /controller-server/src | |
parent | 2d5eb7d0976ac0650ab1d2acfcca4984252f8349 (diff) | |
parent | dfe49809cd3d23c9d7e059ddbf6aee4f763942ab (diff) |
Merge pull request #3585 from vespa-engine/mpolden/retry-from-first-job
Retry from first job
Diffstat (limited to 'controller-server/src')
7 files changed, 135 insertions, 166 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index fd3fa867c99..ab1b5aa3cd4 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -127,9 +127,9 @@ public class ApplicationList { return listOf(list.stream().sorted(Comparator.comparing(application -> application.deployedVersion().orElse(Version.emptyVersion)))); } - /** Returns the subset of applications which currently do not have any job in progress for the given change */ - public ApplicationList notRunningJobFor(Change.VersionChange change) { - return listOf(list.stream().filter(a -> !hasRunningJob(a, change))); + /** Returns the subset of applications that are not upgrading or started upgrading before the grace period */ + public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant startOfGracePeriod) { + return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, startOfGracePeriod))); } // ----------------------------------- Internal helpers @@ -157,11 +157,12 @@ public class ApplicationList { return false; } - private static boolean hasRunningJob(Application application, Change.VersionChange change) { + private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant instant) { return application.deploymentJobs().jobStatus().values().stream() .filter(JobStatus::inProgress) - .filter(jobStatus -> jobStatus.lastTriggered().isPresent()) - .map(jobStatus -> jobStatus.lastTriggered().get()) + .filter(status -> status.lastTriggered().isPresent()) + .map(status -> status.lastTriggered().get()) + .filter(jobRun -> jobRun.at().isAfter(instant)) .anyMatch(jobRun -> jobRun.version().equals(change.version())); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 46f11ef208c..268965850ba 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.application.Change; +import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobError; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobReport; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; @@ -31,6 +32,7 @@ import java.util.logging.Logger; * This class is multithread safe. * * @author bratseth + * @author mpolden */ public class DeploymentTrigger { @@ -97,27 +99,30 @@ public class DeploymentTrigger { /** * Called periodically to cause triggering of jobs in the background */ - public void triggerFailing(ApplicationId applicationId, String cause) { + public void triggerFailing(ApplicationId applicationId, Duration timeout) { try (Lock lock = applications().lock(applicationId)) { Application application = applications().require(applicationId); - if (shouldRetryFromBeginning(application)) { - // failed for a long time: Discard existing change and restart from the component job - application = application.withDeploying(Optional.empty()); - application = trigger(JobType.component, application, false, "Retrying failing deployment from beginning: " + cause, lock); - applications().store(application, lock); - } else { - // retry the failed job (with backoff) - for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { // retry the *first* failing job - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (isFailing(jobStatus)) { - if (shouldRetryNow(jobStatus)) { - application = trigger(jobType, application, false, "Retrying failing job: " + cause, lock); - applications().store(application, lock); - } - break; + if (!application.deploying().isPresent()) { // No ongoing change, no need to retry + return; + } + // Retry first failing job + for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { + JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); + if (isFailing(application.deploying().get(), jobStatus)) { + if (shouldRetryNow(jobStatus)) { + application = trigger(jobType, application, false, "Retrying failing job", lock); + applications().store(application, lock); } + break; } } + // Retry dead job + Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), timeout); + if (firstDeadJob.isPresent()) { + application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job", + lock); + applications().store(application, lock); + } } } @@ -184,24 +189,26 @@ public class DeploymentTrigger { //--- End of methods which triggers deployment jobs ---------------------------- private ApplicationController applications() { return controller.applications(); } - - private boolean isFailing(JobStatus jobStatusOrNull) { - return jobStatusOrNull != null && !jobStatusOrNull.isSuccess(); + + /** Returns whether a job is failing for the current change in the given application */ + private boolean isFailing(Change change, JobStatus status) { + return status != null && + !status.isSuccess() && + status.lastCompletedFor(change); } private boolean isCapacityConstrained(JobType jobType) { return jobType == JobType.stagingTest || jobType == JobType.systemTest; } - private boolean shouldRetryFromBeginning(Application application) { - Instant eightHoursAgo = clock.instant().minus(Duration.ofHours(8)); - Instant failingSince = application.deploymentJobs().failingSince(); - if (failingSince != null && failingSince.isAfter(eightHoursAgo)) return false; - - JobStatus componentJobStatus = application.deploymentJobs().jobStatus().get(JobType.component); - if (componentJobStatus == null) return true; - if ( ! componentJobStatus.lastCompleted().isPresent() ) return true; - return componentJobStatus.lastCompleted().get().at().isBefore(eightHoursAgo); + /** Returns the first job that has been running for more than the given timeout */ + private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs, Duration timeout) { + Instant startOfGracePeriod = controller.clock().instant().minus(timeout); + Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream() + .filter(JobStatus::inProgress) + .sorted(Comparator.comparing(status -> status.lastTriggered().get().at())) + .findFirst(); + return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(startOfGracePeriod)); } /** Decide whether the job should be triggered by the periodic trigger */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java index 43c1755af34..c8831bd6a56 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java @@ -4,13 +4,9 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.application.ApplicationList; -import com.yahoo.vespa.hosted.controller.application.JobStatus; import java.time.Duration; -import java.time.Instant; -import java.util.Comparator; import java.util.List; -import java.util.Optional; /** * Attempts redeployment of failed jobs and deployments. @@ -31,57 +27,11 @@ public class FailureRedeployer extends Maintainer { List<Application> applications = ApplicationList.from(controller().applications().asList()) .notPullRequest() .asList(); - retryFailingJobs(applications); - retryStuckJobs(applications); + applications.forEach(application -> triggerFailing(application, jobTimeout)); } - private void retryFailingJobs(List<Application> applications) { - for (Application application : applications) { - if (!application.deploying().isPresent()) { - continue; - } - if (application.deploymentJobs().inProgress()) { - continue; - } - Optional<JobStatus> failingJob = jobFailingFor(application); - failingJob.ifPresent(job -> triggerFailing(application, "Job " + job.type().id() + - " has been failing since " + job.firstFailing().get())); - } - } - - private void retryStuckJobs(List<Application> applications) { - Instant startOfGracePeriod = controller().clock().instant().minus(jobTimeout); - for (Application application : applications) { - Optional<JobStatus> job = oldestRunningJob(application); - if (!job.isPresent()) { - continue; - } - // Ignore job if it doesn't belong to a zone in this system - if (!job.get().type().zone(controller().system()).isPresent()) { - continue; - } - if (job.get().lastTriggered().get().at().isBefore(startOfGracePeriod)) { - triggerFailing(application, "Job " + job.get().type().id() + - " has been running for more than " + jobTimeout); - } - } - } - - private Optional<JobStatus> jobFailingFor(Application application) { - return application.deploymentJobs().jobStatus().values().stream() - .filter(status -> !status.isSuccess() && status.lastCompletedFor(application.deploying().get())) - .findFirst(); - } - - private Optional<JobStatus> oldestRunningJob(Application application) { - return application.deploymentJobs().jobStatus().values().stream() - .filter(JobStatus::inProgress) - .sorted(Comparator.comparing(status -> status.lastTriggered().get().at())) - .findFirst(); - } - - private void triggerFailing(Application application, String cause) { - controller().applications().deploymentTrigger().triggerFailing(application.id(), cause); + private void triggerFailing(Application application, Duration timeout) { + controller().applications().deploymentTrigger().triggerFailing(application.id(), timeout); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 54ef5df8ac0..fd742ab983a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -11,6 +11,7 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import com.yahoo.yolean.Exceptions; import java.time.Duration; +import java.time.Instant; import java.util.logging.Level; import java.util.logging.Logger; @@ -21,6 +22,8 @@ import java.util.logging.Logger; */ public class Upgrader extends Maintainer { + private static final Duration upgradeTimeout = Duration.ofHours(12); + private static final Logger log = Logger.getLogger(Upgrader.class.getName()); public Upgrader(Controller controller, Duration interval, JobControl jobControl) { @@ -62,12 +65,13 @@ public class Upgrader extends Maintainer { private void upgrade(ApplicationList applications, Version version) { Change.VersionChange change = new Change.VersionChange(version); + Instant startOfUpgradePeriod = controller().clock().instant().minus(upgradeTimeout); cancelUpgradesOf(applications.upgradingToLowerThan(version)); applications = applications.notPullRequest(); // Pull requests are deployed as separate applications to test then deleted; No need to upgrade applications = applications.onLowerVersionThan(version); applications = applications.notDeployingApplication(); // wait with applications deploying an application change applications = applications.notFailingOn(version); // try to upgrade only if it hasn't failed on this version - applications = applications.notRunningJobFor(change); // do not trigger multiple jobs simultaneously for same upgrade + applications = applications.notCurrentlyUpgrading(change, startOfUpgradePeriod); // do not trigger again if currently upgrading applications = applications.canUpgradeAt(controller().clock().instant()); // wait with applications that are currently blocking upgrades for (Application application : applications.byIncreasingDeployedVersion().asList()) { try { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 0c0cc0485c8..5a48bc54b49 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -47,22 +47,29 @@ public class DeploymentTriggerTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); + // system-test fails and is retried tester.deployAndNotify(app, applicationPackage, false, JobType.systemTest); assertEquals("Retried immediately", 1, tester.buildSystem().jobs().size()); - tester.buildSystem().takeJobsToRun(); assertEquals("Job removed", 0, tester.buildSystem().jobs().size()); - tester.clock().advance(Duration.ofHours(2)); + tester.clock().advance(Duration.ofHours(4).plus(Duration.ofSeconds(1))); tester.failureRedeployer().maintain(); + assertEquals("Retried job", 1, tester.buildSystem().jobs().size()); assertEquals(JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); - tester.buildSystem().takeJobsToRun(); assertEquals("Job removed", 0, tester.buildSystem().jobs().size()); + + // system-test succeeds and staging-test starts + tester.failureRedeployer().maintain(); + tester.deployAndNotify(app, applicationPackage, true, JobType.systemTest); + + // staging-test times out and is retried + tester.buildSystem().takeJobsToRun(); tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); tester.failureRedeployer().maintain(); - assertEquals("Retried from the beginning", 1, tester.buildSystem().jobs().size()); - assertEquals(JobType.component.id(), tester.buildSystem().jobs().get(0).jobName()); + assertEquals("Retried dead job", 1, tester.buildSystem().jobs().size()); + assertEquals(JobType.stagingTest.id(), tester.buildSystem().jobs().get(0).jobName()); } @Test diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java index 2d1b7463dea..286db864c22 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java @@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer; -import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.nio.file.Files; @@ -116,81 +115,16 @@ public class FailureRedeployerTest { tester.failureRedeployer().maintain(); assertTrue("No jobs retried", tester.buildSystem().jobs().isEmpty()); - // Just over 12 hours pass, deployment is retried from beginning + // Just over 12 hours pass, job is retried tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); tester.failureRedeployer().maintain(); - assertEquals(DeploymentJobs.JobType.component.id(), tester.buildSystem().takeJobsToRun().get(0).jobName()); - - // Ensure that system-test is triggered after component. Triggering component records a new change, but in this - // case there's already a change in progress which we want to discard and start over - tester.notifyJobCompletion(DeploymentJobs.JobType.component, app, true); - assertEquals(DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); - } - - @Test - public void testAlwaysRestartsDeploymentOfApplicationsWithStuckJobs() { - DeploymentTester tester = new DeploymentTester(); - Version version = Version.fromString("5.0"); - tester.updateVersionStatus(version); - - ApplicationPackage applicationPackage = new ApplicationPackageBuilder() - .environment(Environment.prod) - .region("us-west-1") - .build(); - - // Setup applications - Application canary0 = tester.createAndDeploy("canary0", 0, "canary"); - Application canary1 = tester.createAndDeploy("canary1", 1, "canary"); - Application default0 = tester.createAndDeploy("default0", 2, "default"); - Application default1 = tester.createAndDeploy("default1", 3, "default"); - Application default2 = tester.createAndDeploy("default2", 4, "default"); - Application default3 = tester.createAndDeploy("default3", 5, "default"); - Application default4 = tester.createAndDeploy("default4", 6, "default"); - - // New version is released - version = Version.fromString("5.1"); - tester.updateVersionStatus(version); - assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); - tester.upgrader().maintain(); - - // Canaries upgrade and raise confidence - tester.completeUpgrade(canary0, version, "canary"); - tester.completeUpgrade(canary1, version, "canary"); - tester.updateVersionStatus(version); - assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); - - // Applications with default policy start upgrading - tester.clock().advance(Duration.ofMinutes(1)); - tester.upgrader().maintain(); - assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); - - // 4/5 applications fail, confidence is lowered and upgrade is cancelled - tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); - tester.updateVersionStatus(version); - assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); - tester.upgrader().maintain(); - - // 5th app never reports back and has a dead locked job, but no ongoing change - Application deadLocked = tester.applications().require(default4.id()); - assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); - assertFalse("No change present", deadLocked.deploying().isPresent()); - - // 4/5 applications are repaired and confidence is restored - tester.deployCompletely(default0, applicationPackage); - tester.deployCompletely(default1, applicationPackage); - tester.deployCompletely(default2, applicationPackage); - tester.deployCompletely(default3, applicationPackage); - tester.updateVersionStatus(version); - assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + assertEquals(DeploymentJobs.JobType.stagingTest.id(), tester.buildSystem().takeJobsToRun().get(0).jobName()); - // Over 12 hours pass and failure redeployer restarts deployment of 5th app - tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); - tester.failureRedeployer().maintain(); - assertEquals("Deployment is restarted", DeploymentJobs.JobType.component.id(), - tester.buildSystem().jobs().get(0).jobName()); + // Deployment completes + tester.deploy(DeploymentJobs.JobType.stagingTest, app, applicationPackage, true); + tester.notifyJobCompletion(DeploymentJobs.JobType.stagingTest, app, true); + tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3); + assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty()); } @Test diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 6709098ba06..5be030b4fd9 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -400,4 +400,70 @@ public class UpgraderTest { assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty()); } + @Test + public void testReschedulesUpgradeAfterTimeout() { + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("5.0"); + tester.updateVersionStatus(version); + + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-west-1") + .build(); + + // Setup applications + Application canary0 = tester.createAndDeploy("canary0", 0, "canary"); + Application canary1 = tester.createAndDeploy("canary1", 1, "canary"); + Application default0 = tester.createAndDeploy("default0", 2, "default"); + Application default1 = tester.createAndDeploy("default1", 3, "default"); + Application default2 = tester.createAndDeploy("default2", 4, "default"); + Application default3 = tester.createAndDeploy("default3", 5, "default"); + Application default4 = tester.createAndDeploy("default4", 6, "default"); + + // New version is released + version = Version.fromString("5.1"); + tester.updateVersionStatus(version); + assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + // Canaries upgrade and raise confidence + tester.completeUpgrade(canary0, version, "canary"); + tester.completeUpgrade(canary1, version, "canary"); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Applications with default policy start upgrading + tester.clock().advance(Duration.ofMinutes(1)); + tester.upgrader().maintain(); + assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); + + // 4/5 applications fail, confidence is lowered and upgrade is cancelled + tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); + tester.upgrader().maintain(); + + // 5th app never reports back and has a dead locked job, but no ongoing change + Application deadLocked = tester.applications().require(default4.id()); + assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); + assertFalse("No change present", deadLocked.deploying().isPresent()); + + // 4/5 applications are repaired and confidence is restored + tester.deployCompletely(default0, applicationPackage); + tester.deployCompletely(default1, applicationPackage); + tester.deployCompletely(default2, applicationPackage); + tester.deployCompletely(default3, applicationPackage); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Over 12 hours pass and upgrade is rescheduled for 5th app + tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); + tester.upgrader().maintain(); + assertEquals("Upgrade is rescheduled", DeploymentJobs.JobType.systemTest.id(), + tester.buildSystem().jobs().get(0).jobName()); + } + } |