aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorJon Marius Venstad <jonmv@users.noreply.github.com>2021-08-20 07:48:57 +0200
committerGitHub <noreply@github.com>2021-08-20 07:48:57 +0200
commit73c26cc07d70eb1f84837bffbda97e1d1356869c (patch)
treed985078bcec60d1c958d74d58665d94448053d8f /controller-server
parentfcf90f90b175d568082aef28659463fae5a62ad1 (diff)
parent290dc43262a9f55589d982e066d1f5b701584925 (diff)
Merge pull request #18801 from vespa-engine/jonmv/configurable-upgrade-rollout
Jonmv/configurable upgrade rollout
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java116
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java2
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java15
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java50
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java4
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java1
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json4
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java2
8 files changed, 78 insertions, 116 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 4e6df1921b6..2dbf910b49e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -105,16 +105,6 @@ public class DeploymentTrigger {
instance = instance.withChange(instance.change().with(outstanding.application().get()));
return instance.withChange(remainingChange(instance, status));
});
-
- // Abort irrelevant, running jobs to get new application out faster.
- Map<JobId, List<Versions>> newJobsToRun = jobs.deploymentStatus(application.get()).jobsToRun();
- for (Run run : jobs.active(application.get().id().instance(instanceName))) {
- if ( ! run.id().type().environment().isManuallyDeployed()
- && newJobsToRun.getOrDefault(run.id().job(), List.of()).stream()
- .noneMatch(versions -> versions.targetsMatch(run.versions())
- && versions.sourcesMatchIfPresent(run.versions())))
- jobs.abort(run.id());
- }
}
}
applications().store(application);
@@ -289,10 +279,6 @@ public class DeploymentTrigger {
return controller.applications();
}
- private Optional<Deployment> deploymentFor(Instance instance, JobType jobType) {
- return Optional.ofNullable(instance.deployments().get(jobType.zone(controller.system())));
- }
-
// ---------- Ready job computation ----------
/** Returns the set of all jobs which have changes to propagate from the upstream steps. */
@@ -307,24 +293,22 @@ public class DeploymentTrigger {
.collect(toList());
}
- /**
- * Finds the next step to trigger for the given application, if any, and returns these as a list.
- */
+ /** Finds the next step to trigger for the given application, if any, and returns these as a list. */
private List<Job> computeReadyJobs(DeploymentStatus status) {
List<Job> jobs = new ArrayList<>();
status.jobsToRun().forEach((job, versionsList) -> {
- for (Versions versions : versionsList)
- status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change())
- .filter(readyAt -> ! clock.instant().isBefore(readyAt))
- .filter(__ -> ! status.jobs().get(job).get().isRunning())
- .filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job)))
- .ifPresent(readyAt -> {
- jobs.add(deploymentJob(status.application().require(job.application().instance()),
- versions,
- job.type(),
- status.instanceJobs(job.application().instance()).get(job.type()),
- readyAt));
- });
+ for (Versions versions : versionsList)
+ status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change())
+ .filter(readyAt -> ! clock.instant().isBefore(readyAt))
+ .filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job)))
+ .filter(__ -> abortIfRunning(versionsList, status.jobs().get(job).get())) // Abort and trigger this later if running with outdated parameters.
+ .ifPresent(readyAt -> {
+ jobs.add(deploymentJob(status.application().require(job.application().instance()),
+ versions,
+ job.type(),
+ status.instanceJobs(job.application().instance()).get(job.type()),
+ readyAt));
+ });
});
return Collections.unmodifiableList(jobs);
}
@@ -339,71 +323,17 @@ public class DeploymentTrigger {
return false;
}
- /** Returns whether the given job can trigger at the given instant */
- public boolean triggerAt(Instant instant, JobType job, JobStatus jobStatus, Versions versions, Instance instance, DeploymentSpec deploymentSpec) {
- if (instance.jobPause(job).map(until -> until.isAfter(clock.instant())).orElse(false)) return false;
- if (jobStatus.lastTriggered().isEmpty()) return true;
- if (jobStatus.isSuccess()) return true; // Success
- if (jobStatus.lastCompleted().isEmpty()) return true; // Never completed
- if (jobStatus.firstFailing().isEmpty()) return true; // Should not happen as firstFailing should be set for an unsuccessful job
- if ( ! versions.targetsMatch(jobStatus.lastCompleted().get().versions())) return true; // Always trigger as targets have changed
- if (deploymentSpec.requireInstance(instance.name()).upgradePolicy() == DeploymentSpec.UpgradePolicy.canary) return true; // Don't throttle canaries
-
- Instant firstFailing = jobStatus.firstFailing().get().end().get();
- Instant lastCompleted = jobStatus.lastCompleted().get().end().get();
-
- // Retry all errors immediately for 1 minute
- if (firstFailing.isAfter(instant.minus(Duration.ofMinutes(1)))) return true;
-
- // Retry out of capacity errors in test environments every minute
- if (job.environment().isTest() && jobStatus.isOutOfCapacity()) {
- return lastCompleted.isBefore(instant.minus(Duration.ofMinutes(1)));
- }
-
- // Retry other errors
- if (firstFailing.isAfter(instant.minus(Duration.ofHours(1)))) { // If we failed within the last hour ...
- return lastCompleted.isBefore(instant.minus(Duration.ofMinutes(10))); // ... retry every 10 minutes
- }
- return lastCompleted.isBefore(instant.minus(Duration.ofHours(2))); // Retry at most every 2 hours
- }
-
- // ---------- Completion logic ----------
+ /** Returns whether the job is not running, and also aborts it if it's running with outdated versions. */
+ private boolean abortIfRunning(List<Versions> versionsList, JobStatus status) {
+ if ( ! status.isRunning())
+ return true;
- /**
- * Returns whether the given change is complete for the given application for the given job.
- *
- * Any job is complete if the given change is already successful on that job.
- * A production job is also considered complete if its current change is strictly dominated by what
- * is already deployed in its zone, i.e., no parts of the change are upgrades, and the full current
- * change for the application downgrades the deployment, which is an acknowledgement that the deployed
- * version is broken somehow, such that the job may be locked in failure until a new version is released.
- *
- * Additionally, if the application is pinned to a Vespa version, and the given change has a (this) platform,
- * the deployment for the job must be on the pinned version.
- */
- public boolean isComplete(Change change, Change fullChange, Instance instance, JobType jobType,
- JobStatus status) {
- Optional<Deployment> existingDeployment = deploymentFor(instance, jobType);
- if ( change.isPinned()
- && change.platform().isPresent()
- && ! existingDeployment.map(Deployment::version).equals(change.platform()))
- return false;
-
- return status.lastSuccess()
- .map(run -> change.platform().map(run.versions().targetPlatform()::equals).orElse(true)
- && change.application().map(run.versions().targetApplication()::equals).orElse(true))
- .orElse(false)
- || jobType.isProduction()
- && existingDeployment.map(deployment -> ! isUpgrade(change, deployment) && isDowngrade(fullChange, deployment))
- .orElse(false);
- }
-
- private static boolean isUpgrade(Change change, Deployment deployment) {
- return change.upgrades(deployment.version()) || change.upgrades(deployment.applicationVersion());
- }
+ Run last = status.lastTriggered().get();
+ if (versionsList.stream().noneMatch(versions -> versions.targetsMatch(last.versions())
+ && versions.sourcesMatchIfPresent(last.versions())))
+ controller.jobController().abort(last.id());
- private static boolean isDowngrade(Change change, Deployment deployment) {
- return change.downgrades(deployment.version()) || change.downgrades(deployment.applicationVersion());
+ return false;
}
// ---------- Change management o_O ----------
@@ -411,6 +341,8 @@ public class DeploymentTrigger {
private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance) {
if (status.application().require(instance).change().application().isPresent()) return true; // Replacing a previous application change is ok.
if (status.hasFailures()) return true; // Allow changes to fix upgrade problems.
+ if (status.application().deploymentSpec().instance(instance) // Leading upgrade allows app change to join in.
+ .map(spec -> spec.upgradeRollout() == DeploymentSpec.UpgradeRollout.leading).orElse(false)) return true;
return status.application().require(instance).change().platform().isEmpty();
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java
index e52d1900a9d..86b9370150c 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java
@@ -114,7 +114,7 @@ public class ControllerTest {
context.runJob(stagingTest);
// production job succeeding now
- context.jobAborted(productionUsWest1);
+ context.triggerJobs().jobAborted(productionUsWest1);
context.runJob(productionUsWest1);
// causes triggering of next production job
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java
index 808409cf793..b234ab4960b 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java
@@ -53,6 +53,7 @@ public class ApplicationPackageBuilder {
private OptionalInt majorVersion = OptionalInt.empty();
private String instances = "default";
private String upgradePolicy = null;
+ private String upgradeRollout = null;
private String globalServiceId = null;
private String athenzIdentityAttributes = null;
private String searchDefinition = "search test { }";
@@ -75,6 +76,11 @@ public class ApplicationPackageBuilder {
return this;
}
+ public ApplicationPackageBuilder upgradeRollout(String upgradeRollout) {
+ this.upgradeRollout = upgradeRollout;
+ return this;
+ }
+
public ApplicationPackageBuilder globalServiceId(String globalServiceId) {
this.globalServiceId = globalServiceId;
return this;
@@ -221,10 +227,11 @@ public class ApplicationPackageBuilder {
}
xml.append(">\n");
xml.append(" <instance id='").append(instances).append("'>\n");
- if (upgradePolicy != null) {
- xml.append(" <upgrade policy='");
- xml.append(upgradePolicy);
- xml.append("'/>\n");
+ if (upgradePolicy != null || upgradeRollout != null) {
+ xml.append(" <upgrade ");
+ if (upgradePolicy != null) xml.append("policy='").append(upgradePolicy).append("' ");
+ if (upgradeRollout != null) xml.append("rollout='").append(upgradeRollout).append("' ");
+ xml.append("/>\n");
}
xml.append(notifications);
if (explicitSystemTest)
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index 7077e14a648..d4c9425fc03 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -107,6 +107,27 @@ public class DeploymentTriggerTest {
}
@Test
+ public void leadingUpgradeAllowsApplicationChangeWhileUpgrading() {
+ var applicationPackage = new ApplicationPackageBuilder().region("us-east-3")
+ .upgradeRollout("leading")
+ .build();
+ var app = tester.newDeploymentContext();
+
+ app.submit(applicationPackage).deploy();
+
+ Change upgrade = Change.of(new Version("7.8.9"));
+ tester.controllerTester().upgradeSystem(upgrade.platform().get());
+ tester.upgrader().maintain();
+ app.runJob(systemTest).runJob(stagingTest);
+ tester.triggerJobs();
+ app.assertRunning(productionUsEast3);
+ assertEquals(upgrade, app.instance().change());
+
+ app.submit(applicationPackage);
+ assertEquals(upgrade.with(app.lastSubmission().get()), app.instance().change());
+ }
+
+ @Test
public void abortsJobsOnNewApplicationChange() {
var app = tester.newDeploymentContext();
app.submit()
@@ -120,15 +141,15 @@ public class DeploymentTriggerTest {
app.submit();
assertTrue(tester.jobs().active(id).isPresent());
+ tester.triggerJobs();
tester.runner().run();
- assertFalse(tester.jobs().active(id).isPresent());
+ assertTrue(tester.jobs().active(id).isPresent()); // old run
+ app.runJob(systemTest).runJob(stagingTest).runJob(stagingTest); // outdated run is aborted when otherwise blocking a new run
tester.triggerJobs();
- assertEquals(EnumSet.of(systemTest, stagingTest), tester.jobs().active().stream()
- .map(run -> run.id().type())
- .collect(Collectors.toCollection(() -> EnumSet.noneOf(JobType.class))));
+ app.jobAborted(productionUsCentral1);
- app.deploy();
+ app.runJob(productionUsCentral1).runJob(productionUsWest1).runJob(productionUsEast3);
assertEquals(Change.empty(), app.instance().change());
tester.controllerTester().upgradeSystem(new Version("8.9"));
@@ -138,6 +159,8 @@ public class DeploymentTriggerTest {
// Jobs are not aborted when the new submission remains outstanding.
app.submit();
+ app.runJob(systemTest).runJob(stagingTest);
+ tester.triggerJobs();
tester.runner().run();
assertEquals(EnumSet.of(productionUsCentral1), tester.jobs().active().stream()
.map(run -> run.id().type())
@@ -468,8 +491,9 @@ public class DeploymentTriggerTest {
Version version2 = Version.fromString("7.2");
tester.controllerTester().upgradeSystem(version2);
tester.upgrader().maintain();
- app1.runJob(systemTest).runJob(stagingTest) // tests for previous version — these are "reused" later.
- .runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1);
+ tester.triggerJobs();
+ app1.jobAborted(systemTest).jobAborted(stagingTest);
+ app1.runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1);
assertEquals(version2, app1.deployment(productionUsCentral1.zone(main)).version());
Instant triggered = app1.instanceJobs().get(productionUsCentral1).lastTriggered().get().start();
tester.clock().advance(Duration.ofHours(1));
@@ -481,7 +505,8 @@ public class DeploymentTriggerTest {
assertEquals("Change becomes latest non-broken version", Change.of(version1), app1.instance().change());
// version1 proceeds 'til the last job, where it fails; us-central-1 is skipped, as current change is strictly dominated by what's deployed there.
- app1.failDeployment(productionEuWest1);
+ app1.runJob(systemTest).runJob(stagingTest)
+ .failDeployment(productionEuWest1);
assertEquals(triggered, app1.instanceJobs().get(productionUsCentral1).lastTriggered().get().start());
// Roll out a new application version, which gives a dual change -- this should trigger us-central-1, but only as long as it hasn't yet deployed there.
@@ -548,7 +573,7 @@ public class DeploymentTriggerTest {
app.runJob(systemTest).runJob(stagingTest);
// Finish old run of the aborted production job.
- app.jobAborted(productionUsEast3);
+ app.triggerJobs().jobAborted(productionUsEast3);
// New upgrade is already tested for both jobs.
@@ -701,16 +726,15 @@ public class DeploymentTriggerTest {
// Finish deployment for apps 2 and 3, then release a new version, leaving only app1 with an application upgrade.
app2.deploy();
app3.deploy();
- app1.assertRunning(stagingTest);
- assertEquals(1, tester.jobs().active().size());
+ app1.runJob(stagingTest);
+ assertEquals(0, tester.jobs().active().size());
tester.controllerTester().upgradeSystem(new Version("6.2"));
tester.upgrader().maintain();
app1.submit(applicationPackage);
- app1.jobAborted(stagingTest);
// Tests for app1 trigger before the others since it carries an application upgrade.
- tester.readyJobsTrigger().maintain();
+ tester.readyJobsTrigger().run();
app1.assertRunning(systemTest);
app1.assertRunning(stagingTest);
assertEquals(2, tester.jobs().active().size());
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
index 326f4bf311e..91b5dc232e5 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
@@ -167,7 +167,7 @@ public class UpgraderTest {
// --- Failing application is repaired by changing the application, causing confidence to move above 'high' threshold
// Deploy application change
default0.submit(applicationPackage("default"));
- default0.jobAborted(stagingTest);
+ default0.triggerJobs().jobAborted(stagingTest);
default0.deploy();
tester.controllerTester().computeVersionStatus();
@@ -232,7 +232,7 @@ public class UpgraderTest {
// State: Default applications started upgrading to version5
tester.clock().advance(Duration.ofHours(1));
tester.upgrader().maintain();
- default3.failDeployment(stagingTest);
+ default3.triggerJobs().jobAborted(stagingTest);
default0.runJob(systemTest)
.failDeployment(stagingTest);
default1.runJob(systemTest)
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java
index ebfd78c2764..8a182ce5eca 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java
@@ -84,7 +84,6 @@ import java.io.File;
import java.math.BigInteger;
import java.net.URI;
import java.security.cert.X509Certificate;
-import java.time.Duration;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json
index 55be0881ec2..2813bd0ab7d 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json
@@ -413,7 +413,7 @@
"id": 1,
"url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/production-us-west-1/run/1",
"start": "(ignore)",
- "status": "aborted",
+ "status": "running",
"versions": {
"targetPlatform": "6.1.0",
"targetApplication": {
@@ -469,7 +469,7 @@
"id": 2,
"url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/production-us-east-3/run/2",
"start": "(ignore)",
- "status": "aborted",
+ "status": "running",
"versions": {
"targetPlatform": "6.1.0",
"targetApplication": {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java
index 03002aee955..e3fef9f9066 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java
@@ -304,7 +304,7 @@ public class VersionStatusTest {
Confidence.low, confidence(tester.controller(), version2));
// Remaining canary upgrades to version2 which raises confidence to normal and more apps upgrade
- canary2.failDeployment(systemTest);
+ canary2.triggerJobs().jobAborted(systemTest).jobAborted(stagingTest);
canary2.runJob(stagingTest);
canary2.deployPlatform(version2);
tester.controllerTester().computeVersionStatus();