diff options
author | Martin Polden <mpolden@mpolden.no> | 2017-10-11 12:29:46 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-11 12:29:46 +0200 |
commit | ed9f5bf9a8ff3fad1dd19df7569f4ccb4844db96 (patch) | |
tree | a8765570f1f41b3c3e2ab5747a24456ba9e32c36 /controller-server | |
parent | 41954227a91b74499f24326ab288ea21f8eae604 (diff) | |
parent | 9b20ec1e291192b073a767bb0c0612187c5dd4cc (diff) |
Merge pull request #3704 from vespa-engine/bratseth/remove-in-progress
Bratseth/remove in progress
Diffstat (limited to 'controller-server')
6 files changed, 48 insertions, 40 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index b4a83528816..7ff5a23e178 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -140,9 +140,9 @@ public class ApplicationList { return listOf(list.stream().sorted(Comparator.comparing(application -> application.deployedVersion().orElse(Version.emptyVersion)))); } - /** Returns the subset of applications that are not upgrading or started upgrading before the grace period */ - public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant startOfGracePeriod) { - return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, startOfGracePeriod))); + /** Returns the subset of applications that are not currently upgrading */ + public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant jobTimeoutLimit) { + return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, jobTimeoutLimit))); } // ----------------------------------- Internal helpers @@ -170,12 +170,11 @@ public class ApplicationList { return false; } - private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant instant) { + private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant jobTimeoutLimit) { return application.deploymentJobs().jobStatus().values().stream() - .filter(JobStatus::inProgress) + .filter(status -> status.isRunning(jobTimeoutLimit)) .filter(status -> status.lastTriggered().isPresent()) .map(status -> status.lastTriggered().get()) - .filter(jobRun -> jobRun.at().isAfter(instant)) .anyMatch(jobRun -> jobRun.version().equals(change.version())); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java index 26cf44002ff..1ffa06bb624 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java @@ -101,8 +101,15 @@ public class DeploymentJobs { } /** Returns whether any job is currently in progress */ - public boolean inProgress() { - return status.values().stream().anyMatch(JobStatus::inProgress); + public boolean isRunning(Instant timeoutLimit) { + return status.values().stream().anyMatch(job -> job.isRunning(timeoutLimit)); + } + + /** Returns whether the given job type is currently running and was started after timeoutLimit */ + public boolean isRunning(JobType jobType, Instant timeoutLimit) { + JobStatus jobStatus = status.get(jobType); + if ( jobStatus == null) return false; + return jobStatus.isRunning(timeoutLimit); } /** Returns whether change can be deployed to the given environment */ @@ -310,11 +317,4 @@ public class DeploymentJobs { return id; } - /** Returns whether the given job type is currently running and was started after timeoutLimit */ - public boolean isRunning(JobType jobType, Instant timeoutLimit) { - JobStatus jobStatus = status.get(jobType); - if ( jobStatus == null) return false; - return jobStatus.isRunning(timeoutLimit); - } - }
\ No newline at end of file diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java index cb5d62cda1b..1c8aa2adada 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java @@ -111,18 +111,6 @@ public class JobStatus { /** The error of the last completion, or empty if the last run succeeded */ public Optional<DeploymentJobs.JobError> jobError() { return jobError; } - /** Returns true if job is in progress */ - // TODO: Replace with isRunning - public boolean inProgress() { - if (!lastTriggered().isPresent()) { - return false; - } - if (!lastCompleted().isPresent()) { - return true; - } - return lastTriggered().get().at().isAfter(lastCompleted().get().at()); - } - /** * Returns the last triggering of this job, or empty if the controller has never triggered it * and not seen a deployment for it diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 18d846f48e5..a5d7786f3e9 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -57,6 +57,9 @@ public class DeploymentTrigger { this.order = new DeploymentOrder(controller); } + /** Returns the time in the past before which jobs are at this moment considered unresponsive */ + public Instant jobTimeoutLimit() { return clock.instant().minus(jobTimeout); } + //--- Start of methods which triggers deployment jobs ------------------------- /** @@ -127,7 +130,7 @@ public class DeploymentTrigger { // TODO: Do this for all jobs not just staging, and (with more work) remove triggerFailing and triggerDelayed if (jobType.environment().equals(Environment.staging)) { JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (jobStatus.isRunning(clock.instant().minus(jobTimeout))) continue; + if (jobStatus.isRunning(jobTimeoutLimit())) continue; for (JobType nextJobType : order.nextAfter(jobType, application)) { JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType); @@ -181,7 +184,7 @@ public class DeploymentTrigger { } // Retry dead job - Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), jobTimeout); + Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs()); if (firstDeadJob.isPresent()) { application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job", lock); @@ -195,7 +198,7 @@ public class DeploymentTrigger { for (Application application : applications().asList()) { if ( ! application.deploying().isPresent() ) continue; if (application.deploymentJobs().hasFailures()) continue; - if (application.deploymentJobs().inProgress()) continue; + if (application.deploymentJobs().isRunning(controller.applications().deploymentTrigger().jobTimeoutLimit())) continue; if (application.deploymentSpec().steps().stream().noneMatch(step -> step instanceof DeploymentSpec.Delay)) { continue; // Application does not have any delayed deployments } @@ -266,19 +269,18 @@ public class DeploymentTrigger { } /** Returns the first job that has been running for more than the given timeout */ - private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs, Duration timeout) { - Instant startOfGracePeriod = controller.clock().instant().minus(timeout); + private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs) { Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream() - .filter(JobStatus::inProgress) + .filter(job -> job.isRunning(Instant.ofEpochMilli(0))) .sorted(Comparator.comparing(status -> status.lastTriggered().get().at())) .findFirst(); - return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(startOfGracePeriod)); + return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(jobTimeoutLimit())); } /** Decide whether the job should be triggered by the periodic trigger */ private boolean shouldRetryNow(JobStatus job) { if (job.isSuccess()) return false; - if (job.inProgress()) return false; + if (job.isRunning(jobTimeoutLimit())) return false; // Retry after 10% of the time since it started failing Duration aTenthOfFailTime = Duration.ofMillis( (clock.millis() - job.firstFailing().get().at().toEpochMilli()) / 10); @@ -342,7 +344,7 @@ public class DeploymentTrigger { return application; } - if (application.deploymentJobs().isRunning(jobType, clock.instant().minus(jobTimeout))) { + if (application.deploymentJobs().isRunning(jobType, jobTimeoutLimit())) { return application; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 122cb69b34f..0722a58e18d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Change; +import com.yahoo.vespa.hosted.controller.deployment.BuildSystem; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import com.yahoo.yolean.Exceptions; @@ -70,14 +71,13 @@ public class Upgrader extends Maintainer { private void upgrade(ApplicationList applications, Version version) { Change.VersionChange change = new Change.VersionChange(version); - Instant startOfUpgradePeriod = controller().clock().instant().minus(upgradeTimeout); cancelUpgradesOf(applications.upgradingToLowerThan(version)); applications = applications.notPullRequest(); // Pull requests are deployed as separate applications to test then deleted; No need to upgrade applications = applications.hasProductionDeployment(); applications = applications.onLowerVersionThan(version); applications = applications.notDeployingApplication(); // wait with applications deploying an application change applications = applications.notFailingOn(version); // try to upgrade only if it hasn't failed on this version - applications = applications.notCurrentlyUpgrading(change, startOfUpgradePeriod); // do not trigger again if currently upgrading + applications = applications.notCurrentlyUpgrading(change, controller().applications().deploymentTrigger().jobTimeoutLimit()); applications = applications.canUpgradeAt(controller().clock().instant()); // wait with applications that are currently blocking upgrades applications = applications.byIncreasingDeployedVersion(); // start with lowest versions applications = applications.first(numberOfApplicationsToUpgrade()); // throttle upgrades diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index a2a71f24275..29f1bce5ebe 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -418,6 +418,8 @@ public class UpgraderTest { Application default2 = tester.createAndDeploy("default2", 5, "default"); Application default3 = tester.createAndDeploy("default3", 6, "default"); Application default4 = tester.createAndDeploy("default4", 7, "default"); + + assertEquals(version, default0.deployedVersion().get()); // New version is released version = Version.fromString("5.1"); @@ -445,9 +447,9 @@ public class UpgraderTest { assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); - // 5th app never reports back and has a dead locked job, but no ongoing change + // 5th app never reports back and has a dead job, but no ongoing change Application deadLocked = tester.applications().require(default4.id()); - assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); + assertTrue("Jobs in progress", deadLocked.deploymentJobs().isRunning(tester.controller().applications().deploymentTrigger().jobTimeoutLimit())); assertFalse("No change present", deadLocked.deploying().isPresent()); // 4/5 applications are repaired and confidence is restored @@ -455,14 +457,31 @@ public class UpgraderTest { tester.deployCompletely(default1, applicationPackage); tester.deployCompletely(default2, applicationPackage); tester.deployCompletely(default3, applicationPackage); + tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + tester.upgrader().maintain(); + assertEquals("Upgrade scheduled for previously failing apps", 4, tester.buildSystem().jobs().size()); + tester.completeUpgrade(default0, version, "default"); + tester.completeUpgrade(default1, version, "default"); + tester.completeUpgrade(default2, version, "default"); + tester.completeUpgrade(default3, version, "default"); + + assertEquals(version, tester.application(default0.id()).deployedVersion().get()); + assertEquals(version, tester.application(default1.id()).deployedVersion().get()); + assertEquals(version, tester.application(default2.id()).deployedVersion().get()); + assertEquals(version, tester.application(default3.id()).deployedVersion().get()); + // Over 12 hours pass and upgrade is rescheduled for 5th app + assertEquals(0, tester.buildSystem().jobs().size()); tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); tester.upgrader().maintain(); + assertEquals(1, tester.buildSystem().jobs().size()); assertEquals("Upgrade is rescheduled", DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); + tester.deployCompletely(default4, applicationPackage); + assertEquals(version, tester.application(default4.id()).deployedVersion().get()); } @Test |