diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2021-08-20 07:48:57 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-08-20 07:48:57 +0200 |
commit | 73c26cc07d70eb1f84837bffbda97e1d1356869c (patch) | |
tree | d985078bcec60d1c958d74d58665d94448053d8f /controller-server | |
parent | fcf90f90b175d568082aef28659463fae5a62ad1 (diff) | |
parent | 290dc43262a9f55589d982e066d1f5b701584925 (diff) |
Merge pull request #18801 from vespa-engine/jonmv/configurable-upgrade-rollout
Jonmv/configurable upgrade rollout
Diffstat (limited to 'controller-server')
8 files changed, 78 insertions, 116 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 4e6df1921b6..2dbf910b49e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -105,16 +105,6 @@ public class DeploymentTrigger { instance = instance.withChange(instance.change().with(outstanding.application().get())); return instance.withChange(remainingChange(instance, status)); }); - - // Abort irrelevant, running jobs to get new application out faster. - Map<JobId, List<Versions>> newJobsToRun = jobs.deploymentStatus(application.get()).jobsToRun(); - for (Run run : jobs.active(application.get().id().instance(instanceName))) { - if ( ! run.id().type().environment().isManuallyDeployed() - && newJobsToRun.getOrDefault(run.id().job(), List.of()).stream() - .noneMatch(versions -> versions.targetsMatch(run.versions()) - && versions.sourcesMatchIfPresent(run.versions()))) - jobs.abort(run.id()); - } } } applications().store(application); @@ -289,10 +279,6 @@ public class DeploymentTrigger { return controller.applications(); } - private Optional<Deployment> deploymentFor(Instance instance, JobType jobType) { - return Optional.ofNullable(instance.deployments().get(jobType.zone(controller.system()))); - } - // ---------- Ready job computation ---------- /** Returns the set of all jobs which have changes to propagate from the upstream steps. */ @@ -307,24 +293,22 @@ public class DeploymentTrigger { .collect(toList()); } - /** - * Finds the next step to trigger for the given application, if any, and returns these as a list. - */ + /** Finds the next step to trigger for the given application, if any, and returns these as a list. */ private List<Job> computeReadyJobs(DeploymentStatus status) { List<Job> jobs = new ArrayList<>(); status.jobsToRun().forEach((job, versionsList) -> { - for (Versions versions : versionsList) - status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change()) - .filter(readyAt -> ! clock.instant().isBefore(readyAt)) - .filter(__ -> ! status.jobs().get(job).get().isRunning()) - .filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job))) - .ifPresent(readyAt -> { - jobs.add(deploymentJob(status.application().require(job.application().instance()), - versions, - job.type(), - status.instanceJobs(job.application().instance()).get(job.type()), - readyAt)); - }); + for (Versions versions : versionsList) + status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change()) + .filter(readyAt -> ! clock.instant().isBefore(readyAt)) + .filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job))) + .filter(__ -> abortIfRunning(versionsList, status.jobs().get(job).get())) // Abort and trigger this later if running with outdated parameters. + .ifPresent(readyAt -> { + jobs.add(deploymentJob(status.application().require(job.application().instance()), + versions, + job.type(), + status.instanceJobs(job.application().instance()).get(job.type()), + readyAt)); + }); }); return Collections.unmodifiableList(jobs); } @@ -339,71 +323,17 @@ public class DeploymentTrigger { return false; } - /** Returns whether the given job can trigger at the given instant */ - public boolean triggerAt(Instant instant, JobType job, JobStatus jobStatus, Versions versions, Instance instance, DeploymentSpec deploymentSpec) { - if (instance.jobPause(job).map(until -> until.isAfter(clock.instant())).orElse(false)) return false; - if (jobStatus.lastTriggered().isEmpty()) return true; - if (jobStatus.isSuccess()) return true; // Success - if (jobStatus.lastCompleted().isEmpty()) return true; // Never completed - if (jobStatus.firstFailing().isEmpty()) return true; // Should not happen as firstFailing should be set for an unsuccessful job - if ( ! versions.targetsMatch(jobStatus.lastCompleted().get().versions())) return true; // Always trigger as targets have changed - if (deploymentSpec.requireInstance(instance.name()).upgradePolicy() == DeploymentSpec.UpgradePolicy.canary) return true; // Don't throttle canaries - - Instant firstFailing = jobStatus.firstFailing().get().end().get(); - Instant lastCompleted = jobStatus.lastCompleted().get().end().get(); - - // Retry all errors immediately for 1 minute - if (firstFailing.isAfter(instant.minus(Duration.ofMinutes(1)))) return true; - - // Retry out of capacity errors in test environments every minute - if (job.environment().isTest() && jobStatus.isOutOfCapacity()) { - return lastCompleted.isBefore(instant.minus(Duration.ofMinutes(1))); - } - - // Retry other errors - if (firstFailing.isAfter(instant.minus(Duration.ofHours(1)))) { // If we failed within the last hour ... - return lastCompleted.isBefore(instant.minus(Duration.ofMinutes(10))); // ... retry every 10 minutes - } - return lastCompleted.isBefore(instant.minus(Duration.ofHours(2))); // Retry at most every 2 hours - } - - // ---------- Completion logic ---------- + /** Returns whether the job is not running, and also aborts it if it's running with outdated versions. */ + private boolean abortIfRunning(List<Versions> versionsList, JobStatus status) { + if ( ! status.isRunning()) + return true; - /** - * Returns whether the given change is complete for the given application for the given job. - * - * Any job is complete if the given change is already successful on that job. - * A production job is also considered complete if its current change is strictly dominated by what - * is already deployed in its zone, i.e., no parts of the change are upgrades, and the full current - * change for the application downgrades the deployment, which is an acknowledgement that the deployed - * version is broken somehow, such that the job may be locked in failure until a new version is released. - * - * Additionally, if the application is pinned to a Vespa version, and the given change has a (this) platform, - * the deployment for the job must be on the pinned version. - */ - public boolean isComplete(Change change, Change fullChange, Instance instance, JobType jobType, - JobStatus status) { - Optional<Deployment> existingDeployment = deploymentFor(instance, jobType); - if ( change.isPinned() - && change.platform().isPresent() - && ! existingDeployment.map(Deployment::version).equals(change.platform())) - return false; - - return status.lastSuccess() - .map(run -> change.platform().map(run.versions().targetPlatform()::equals).orElse(true) - && change.application().map(run.versions().targetApplication()::equals).orElse(true)) - .orElse(false) - || jobType.isProduction() - && existingDeployment.map(deployment -> ! isUpgrade(change, deployment) && isDowngrade(fullChange, deployment)) - .orElse(false); - } - - private static boolean isUpgrade(Change change, Deployment deployment) { - return change.upgrades(deployment.version()) || change.upgrades(deployment.applicationVersion()); - } + Run last = status.lastTriggered().get(); + if (versionsList.stream().noneMatch(versions -> versions.targetsMatch(last.versions()) + && versions.sourcesMatchIfPresent(last.versions()))) + controller.jobController().abort(last.id()); - private static boolean isDowngrade(Change change, Deployment deployment) { - return change.downgrades(deployment.version()) || change.downgrades(deployment.applicationVersion()); + return false; } // ---------- Change management o_O ---------- @@ -411,6 +341,8 @@ public class DeploymentTrigger { private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance) { if (status.application().require(instance).change().application().isPresent()) return true; // Replacing a previous application change is ok. if (status.hasFailures()) return true; // Allow changes to fix upgrade problems. + if (status.application().deploymentSpec().instance(instance) // Leading upgrade allows app change to join in. + .map(spec -> spec.upgradeRollout() == DeploymentSpec.UpgradeRollout.leading).orElse(false)) return true; return status.application().require(instance).change().platform().isEmpty(); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java index e52d1900a9d..86b9370150c 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java @@ -114,7 +114,7 @@ public class ControllerTest { context.runJob(stagingTest); // production job succeeding now - context.jobAborted(productionUsWest1); + context.triggerJobs().jobAborted(productionUsWest1); context.runJob(productionUsWest1); // causes triggering of next production job diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java index 808409cf793..b234ab4960b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/ApplicationPackageBuilder.java @@ -53,6 +53,7 @@ public class ApplicationPackageBuilder { private OptionalInt majorVersion = OptionalInt.empty(); private String instances = "default"; private String upgradePolicy = null; + private String upgradeRollout = null; private String globalServiceId = null; private String athenzIdentityAttributes = null; private String searchDefinition = "search test { }"; @@ -75,6 +76,11 @@ public class ApplicationPackageBuilder { return this; } + public ApplicationPackageBuilder upgradeRollout(String upgradeRollout) { + this.upgradeRollout = upgradeRollout; + return this; + } + public ApplicationPackageBuilder globalServiceId(String globalServiceId) { this.globalServiceId = globalServiceId; return this; @@ -221,10 +227,11 @@ public class ApplicationPackageBuilder { } xml.append(">\n"); xml.append(" <instance id='").append(instances).append("'>\n"); - if (upgradePolicy != null) { - xml.append(" <upgrade policy='"); - xml.append(upgradePolicy); - xml.append("'/>\n"); + if (upgradePolicy != null || upgradeRollout != null) { + xml.append(" <upgrade "); + if (upgradePolicy != null) xml.append("policy='").append(upgradePolicy).append("' "); + if (upgradeRollout != null) xml.append("rollout='").append(upgradeRollout).append("' "); + xml.append("/>\n"); } xml.append(notifications); if (explicitSystemTest) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 7077e14a648..d4c9425fc03 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -107,6 +107,27 @@ public class DeploymentTriggerTest { } @Test + public void leadingUpgradeAllowsApplicationChangeWhileUpgrading() { + var applicationPackage = new ApplicationPackageBuilder().region("us-east-3") + .upgradeRollout("leading") + .build(); + var app = tester.newDeploymentContext(); + + app.submit(applicationPackage).deploy(); + + Change upgrade = Change.of(new Version("7.8.9")); + tester.controllerTester().upgradeSystem(upgrade.platform().get()); + tester.upgrader().maintain(); + app.runJob(systemTest).runJob(stagingTest); + tester.triggerJobs(); + app.assertRunning(productionUsEast3); + assertEquals(upgrade, app.instance().change()); + + app.submit(applicationPackage); + assertEquals(upgrade.with(app.lastSubmission().get()), app.instance().change()); + } + + @Test public void abortsJobsOnNewApplicationChange() { var app = tester.newDeploymentContext(); app.submit() @@ -120,15 +141,15 @@ public class DeploymentTriggerTest { app.submit(); assertTrue(tester.jobs().active(id).isPresent()); + tester.triggerJobs(); tester.runner().run(); - assertFalse(tester.jobs().active(id).isPresent()); + assertTrue(tester.jobs().active(id).isPresent()); // old run + app.runJob(systemTest).runJob(stagingTest).runJob(stagingTest); // outdated run is aborted when otherwise blocking a new run tester.triggerJobs(); - assertEquals(EnumSet.of(systemTest, stagingTest), tester.jobs().active().stream() - .map(run -> run.id().type()) - .collect(Collectors.toCollection(() -> EnumSet.noneOf(JobType.class)))); + app.jobAborted(productionUsCentral1); - app.deploy(); + app.runJob(productionUsCentral1).runJob(productionUsWest1).runJob(productionUsEast3); assertEquals(Change.empty(), app.instance().change()); tester.controllerTester().upgradeSystem(new Version("8.9")); @@ -138,6 +159,8 @@ public class DeploymentTriggerTest { // Jobs are not aborted when the new submission remains outstanding. app.submit(); + app.runJob(systemTest).runJob(stagingTest); + tester.triggerJobs(); tester.runner().run(); assertEquals(EnumSet.of(productionUsCentral1), tester.jobs().active().stream() .map(run -> run.id().type()) @@ -468,8 +491,9 @@ public class DeploymentTriggerTest { Version version2 = Version.fromString("7.2"); tester.controllerTester().upgradeSystem(version2); tester.upgrader().maintain(); - app1.runJob(systemTest).runJob(stagingTest) // tests for previous version — these are "reused" later. - .runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1); + tester.triggerJobs(); + app1.jobAborted(systemTest).jobAborted(stagingTest); + app1.runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1); assertEquals(version2, app1.deployment(productionUsCentral1.zone(main)).version()); Instant triggered = app1.instanceJobs().get(productionUsCentral1).lastTriggered().get().start(); tester.clock().advance(Duration.ofHours(1)); @@ -481,7 +505,8 @@ public class DeploymentTriggerTest { assertEquals("Change becomes latest non-broken version", Change.of(version1), app1.instance().change()); // version1 proceeds 'til the last job, where it fails; us-central-1 is skipped, as current change is strictly dominated by what's deployed there. - app1.failDeployment(productionEuWest1); + app1.runJob(systemTest).runJob(stagingTest) + .failDeployment(productionEuWest1); assertEquals(triggered, app1.instanceJobs().get(productionUsCentral1).lastTriggered().get().start()); // Roll out a new application version, which gives a dual change -- this should trigger us-central-1, but only as long as it hasn't yet deployed there. @@ -548,7 +573,7 @@ public class DeploymentTriggerTest { app.runJob(systemTest).runJob(stagingTest); // Finish old run of the aborted production job. - app.jobAborted(productionUsEast3); + app.triggerJobs().jobAborted(productionUsEast3); // New upgrade is already tested for both jobs. @@ -701,16 +726,15 @@ public class DeploymentTriggerTest { // Finish deployment for apps 2 and 3, then release a new version, leaving only app1 with an application upgrade. app2.deploy(); app3.deploy(); - app1.assertRunning(stagingTest); - assertEquals(1, tester.jobs().active().size()); + app1.runJob(stagingTest); + assertEquals(0, tester.jobs().active().size()); tester.controllerTester().upgradeSystem(new Version("6.2")); tester.upgrader().maintain(); app1.submit(applicationPackage); - app1.jobAborted(stagingTest); // Tests for app1 trigger before the others since it carries an application upgrade. - tester.readyJobsTrigger().maintain(); + tester.readyJobsTrigger().run(); app1.assertRunning(systemTest); app1.assertRunning(stagingTest); assertEquals(2, tester.jobs().active().size()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 326f4bf311e..91b5dc232e5 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -167,7 +167,7 @@ public class UpgraderTest { // --- Failing application is repaired by changing the application, causing confidence to move above 'high' threshold // Deploy application change default0.submit(applicationPackage("default")); - default0.jobAborted(stagingTest); + default0.triggerJobs().jobAborted(stagingTest); default0.deploy(); tester.controllerTester().computeVersionStatus(); @@ -232,7 +232,7 @@ public class UpgraderTest { // State: Default applications started upgrading to version5 tester.clock().advance(Duration.ofHours(1)); tester.upgrader().maintain(); - default3.failDeployment(stagingTest); + default3.triggerJobs().jobAborted(stagingTest); default0.runJob(systemTest) .failDeployment(stagingTest); default1.runJob(systemTest) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java index ebfd78c2764..8a182ce5eca 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java @@ -84,7 +84,6 @@ import java.io.File; import java.math.BigInteger; import java.net.URI; import java.security.cert.X509Certificate; -import java.time.Duration; import java.time.Instant; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json index 55be0881ec2..2813bd0ab7d 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json @@ -413,7 +413,7 @@ "id": 1, "url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/production-us-west-1/run/1", "start": "(ignore)", - "status": "aborted", + "status": "running", "versions": { "targetPlatform": "6.1.0", "targetApplication": { @@ -469,7 +469,7 @@ "id": 2, "url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/production-us-east-3/run/2", "start": "(ignore)", - "status": "aborted", + "status": "running", "versions": { "targetPlatform": "6.1.0", "targetApplication": { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java index 03002aee955..e3fef9f9066 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java @@ -304,7 +304,7 @@ public class VersionStatusTest { Confidence.low, confidence(tester.controller(), version2)); // Remaining canary upgrades to version2 which raises confidence to normal and more apps upgrade - canary2.failDeployment(systemTest); + canary2.triggerJobs().jobAborted(systemTest).jobAborted(stagingTest); canary2.runJob(stagingTest); canary2.deployPlatform(version2); tester.controllerTester().computeVersionStatus(); |