aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2017-10-11 12:29:46 +0200
committerGitHub <noreply@github.com>2017-10-11 12:29:46 +0200
commited9f5bf9a8ff3fad1dd19df7569f4ccb4844db96 (patch)
treea8765570f1f41b3c3e2ab5747a24456ba9e32c36 /controller-server
parent41954227a91b74499f24326ab288ea21f8eae604 (diff)
parent9b20ec1e291192b073a767bb0c0612187c5dd4cc (diff)
Merge pull request #3704 from vespa-engine/bratseth/remove-in-progress
Bratseth/remove in progress
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java11
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java18
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java12
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java20
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java4
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java23
6 files changed, 48 insertions, 40 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
index b4a83528816..7ff5a23e178 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
@@ -140,9 +140,9 @@ public class ApplicationList {
return listOf(list.stream().sorted(Comparator.comparing(application -> application.deployedVersion().orElse(Version.emptyVersion))));
}
- /** Returns the subset of applications that are not upgrading or started upgrading before the grace period */
- public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant startOfGracePeriod) {
- return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, startOfGracePeriod)));
+ /** Returns the subset of applications that are not currently upgrading */
+ public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant jobTimeoutLimit) {
+ return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, jobTimeoutLimit)));
}
// ----------------------------------- Internal helpers
@@ -170,12 +170,11 @@ public class ApplicationList {
return false;
}
- private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant instant) {
+ private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant jobTimeoutLimit) {
return application.deploymentJobs().jobStatus().values().stream()
- .filter(JobStatus::inProgress)
+ .filter(status -> status.isRunning(jobTimeoutLimit))
.filter(status -> status.lastTriggered().isPresent())
.map(status -> status.lastTriggered().get())
- .filter(jobRun -> jobRun.at().isAfter(instant))
.anyMatch(jobRun -> jobRun.version().equals(change.version()));
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
index 26cf44002ff..1ffa06bb624 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java
@@ -101,8 +101,15 @@ public class DeploymentJobs {
}
/** Returns whether any job is currently in progress */
- public boolean inProgress() {
- return status.values().stream().anyMatch(JobStatus::inProgress);
+ public boolean isRunning(Instant timeoutLimit) {
+ return status.values().stream().anyMatch(job -> job.isRunning(timeoutLimit));
+ }
+
+ /** Returns whether the given job type is currently running and was started after timeoutLimit */
+ public boolean isRunning(JobType jobType, Instant timeoutLimit) {
+ JobStatus jobStatus = status.get(jobType);
+ if ( jobStatus == null) return false;
+ return jobStatus.isRunning(timeoutLimit);
}
/** Returns whether change can be deployed to the given environment */
@@ -310,11 +317,4 @@ public class DeploymentJobs {
return id;
}
- /** Returns whether the given job type is currently running and was started after timeoutLimit */
- public boolean isRunning(JobType jobType, Instant timeoutLimit) {
- JobStatus jobStatus = status.get(jobType);
- if ( jobStatus == null) return false;
- return jobStatus.isRunning(timeoutLimit);
- }
-
} \ No newline at end of file
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java
index cb5d62cda1b..1c8aa2adada 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java
@@ -111,18 +111,6 @@ public class JobStatus {
/** The error of the last completion, or empty if the last run succeeded */
public Optional<DeploymentJobs.JobError> jobError() { return jobError; }
- /** Returns true if job is in progress */
- // TODO: Replace with isRunning
- public boolean inProgress() {
- if (!lastTriggered().isPresent()) {
- return false;
- }
- if (!lastCompleted().isPresent()) {
- return true;
- }
- return lastTriggered().get().at().isAfter(lastCompleted().get().at());
- }
-
/**
* Returns the last triggering of this job, or empty if the controller has never triggered it
* and not seen a deployment for it
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 18d846f48e5..a5d7786f3e9 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -57,6 +57,9 @@ public class DeploymentTrigger {
this.order = new DeploymentOrder(controller);
}
+ /** Returns the time in the past before which jobs are at this moment considered unresponsive */
+ public Instant jobTimeoutLimit() { return clock.instant().minus(jobTimeout); }
+
//--- Start of methods which triggers deployment jobs -------------------------
/**
@@ -127,7 +130,7 @@ public class DeploymentTrigger {
// TODO: Do this for all jobs not just staging, and (with more work) remove triggerFailing and triggerDelayed
if (jobType.environment().equals(Environment.staging)) {
JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
- if (jobStatus.isRunning(clock.instant().minus(jobTimeout))) continue;
+ if (jobStatus.isRunning(jobTimeoutLimit())) continue;
for (JobType nextJobType : order.nextAfter(jobType, application)) {
JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType);
@@ -181,7 +184,7 @@ public class DeploymentTrigger {
}
// Retry dead job
- Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), jobTimeout);
+ Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs());
if (firstDeadJob.isPresent()) {
application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job",
lock);
@@ -195,7 +198,7 @@ public class DeploymentTrigger {
for (Application application : applications().asList()) {
if ( ! application.deploying().isPresent() ) continue;
if (application.deploymentJobs().hasFailures()) continue;
- if (application.deploymentJobs().inProgress()) continue;
+ if (application.deploymentJobs().isRunning(controller.applications().deploymentTrigger().jobTimeoutLimit())) continue;
if (application.deploymentSpec().steps().stream().noneMatch(step -> step instanceof DeploymentSpec.Delay)) {
continue; // Application does not have any delayed deployments
}
@@ -266,19 +269,18 @@ public class DeploymentTrigger {
}
/** Returns the first job that has been running for more than the given timeout */
- private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs, Duration timeout) {
- Instant startOfGracePeriod = controller.clock().instant().minus(timeout);
+ private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs) {
Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream()
- .filter(JobStatus::inProgress)
+ .filter(job -> job.isRunning(Instant.ofEpochMilli(0)))
.sorted(Comparator.comparing(status -> status.lastTriggered().get().at()))
.findFirst();
- return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(startOfGracePeriod));
+ return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(jobTimeoutLimit()));
}
/** Decide whether the job should be triggered by the periodic trigger */
private boolean shouldRetryNow(JobStatus job) {
if (job.isSuccess()) return false;
- if (job.inProgress()) return false;
+ if (job.isRunning(jobTimeoutLimit())) return false;
// Retry after 10% of the time since it started failing
Duration aTenthOfFailTime = Duration.ofMillis( (clock.millis() - job.firstFailing().get().at().toEpochMilli()) / 10);
@@ -342,7 +344,7 @@ public class DeploymentTrigger {
return application;
}
- if (application.deploymentJobs().isRunning(jobType, clock.instant().minus(jobTimeout))) {
+ if (application.deploymentJobs().isRunning(jobType, jobTimeoutLimit())) {
return application;
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index 122cb69b34f..0722a58e18d 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.Change;
+import com.yahoo.vespa.hosted.controller.deployment.BuildSystem;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import com.yahoo.yolean.Exceptions;
@@ -70,14 +71,13 @@ public class Upgrader extends Maintainer {
private void upgrade(ApplicationList applications, Version version) {
Change.VersionChange change = new Change.VersionChange(version);
- Instant startOfUpgradePeriod = controller().clock().instant().minus(upgradeTimeout);
cancelUpgradesOf(applications.upgradingToLowerThan(version));
applications = applications.notPullRequest(); // Pull requests are deployed as separate applications to test then deleted; No need to upgrade
applications = applications.hasProductionDeployment();
applications = applications.onLowerVersionThan(version);
applications = applications.notDeployingApplication(); // wait with applications deploying an application change
applications = applications.notFailingOn(version); // try to upgrade only if it hasn't failed on this version
- applications = applications.notCurrentlyUpgrading(change, startOfUpgradePeriod); // do not trigger again if currently upgrading
+ applications = applications.notCurrentlyUpgrading(change, controller().applications().deploymentTrigger().jobTimeoutLimit());
applications = applications.canUpgradeAt(controller().clock().instant()); // wait with applications that are currently blocking upgrades
applications = applications.byIncreasingDeployedVersion(); // start with lowest versions
applications = applications.first(numberOfApplicationsToUpgrade()); // throttle upgrades
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
index a2a71f24275..29f1bce5ebe 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
@@ -418,6 +418,8 @@ public class UpgraderTest {
Application default2 = tester.createAndDeploy("default2", 5, "default");
Application default3 = tester.createAndDeploy("default3", 6, "default");
Application default4 = tester.createAndDeploy("default4", 7, "default");
+
+ assertEquals(version, default0.deployedVersion().get());
// New version is released
version = Version.fromString("5.1");
@@ -445,9 +447,9 @@ public class UpgraderTest {
assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence());
tester.upgrader().maintain();
- // 5th app never reports back and has a dead locked job, but no ongoing change
+ // 5th app never reports back and has a dead job, but no ongoing change
Application deadLocked = tester.applications().require(default4.id());
- assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress());
+ assertTrue("Jobs in progress", deadLocked.deploymentJobs().isRunning(tester.controller().applications().deploymentTrigger().jobTimeoutLimit()));
assertFalse("No change present", deadLocked.deploying().isPresent());
// 4/5 applications are repaired and confidence is restored
@@ -455,14 +457,31 @@ public class UpgraderTest {
tester.deployCompletely(default1, applicationPackage);
tester.deployCompletely(default2, applicationPackage);
tester.deployCompletely(default3, applicationPackage);
+
tester.updateVersionStatus(version);
assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+ tester.upgrader().maintain();
+ assertEquals("Upgrade scheduled for previously failing apps", 4, tester.buildSystem().jobs().size());
+ tester.completeUpgrade(default0, version, "default");
+ tester.completeUpgrade(default1, version, "default");
+ tester.completeUpgrade(default2, version, "default");
+ tester.completeUpgrade(default3, version, "default");
+
+ assertEquals(version, tester.application(default0.id()).deployedVersion().get());
+ assertEquals(version, tester.application(default1.id()).deployedVersion().get());
+ assertEquals(version, tester.application(default2.id()).deployedVersion().get());
+ assertEquals(version, tester.application(default3.id()).deployedVersion().get());
+
// Over 12 hours pass and upgrade is rescheduled for 5th app
+ assertEquals(0, tester.buildSystem().jobs().size());
tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
tester.upgrader().maintain();
+ assertEquals(1, tester.buildSystem().jobs().size());
assertEquals("Upgrade is rescheduled", DeploymentJobs.JobType.systemTest.id(),
tester.buildSystem().jobs().get(0).jobName());
+ tester.deployCompletely(default4, applicationPackage);
+ assertEquals(version, tester.application(default4.id()).deployedVersion().get());
}
@Test