diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2021-08-19 15:01:58 +0200 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2021-08-19 15:01:58 +0200 |
commit | fa9cb345a17bcedfae450022d50cef687e4482c4 (patch) | |
tree | 58c5226c15124295357b6fe811ca9fb23416762c /controller-server | |
parent | a36f3ee1bedc1917e8ff1aeddcaa3b81f6dfe54e (diff) |
Abort running jobs less eagerly
Diffstat (limited to 'controller-server')
7 files changed, 49 insertions, 46 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 30e731853f2..2dbf910b49e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -105,16 +105,6 @@ public class DeploymentTrigger { instance = instance.withChange(instance.change().with(outstanding.application().get())); return instance.withChange(remainingChange(instance, status)); }); - - // Abort irrelevant, running jobs to get new application out faster. - Map<JobId, List<Versions>> newJobsToRun = jobs.deploymentStatus(application.get()).jobsToRun(); - for (Run run : jobs.active(application.get().id().instance(instanceName))) { - if ( ! run.id().type().environment().isManuallyDeployed() - && newJobsToRun.getOrDefault(run.id().job(), List.of()).stream() - .noneMatch(versions -> versions.targetsMatch(run.versions()) - && versions.sourcesMatchIfPresent(run.versions()))) - jobs.abort(run.id()); - } } } applications().store(application); @@ -303,24 +293,22 @@ public class DeploymentTrigger { .collect(toList()); } - /** - * Finds the next step to trigger for the given application, if any, and returns these as a list. - */ + /** Finds the next step to trigger for the given application, if any, and returns these as a list. */ private List<Job> computeReadyJobs(DeploymentStatus status) { List<Job> jobs = new ArrayList<>(); status.jobsToRun().forEach((job, versionsList) -> { - for (Versions versions : versionsList) - status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change()) - .filter(readyAt -> ! clock.instant().isBefore(readyAt)) - .filter(__ -> ! status.jobs().get(job).get().isRunning()) - .filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job))) - .ifPresent(readyAt -> { - jobs.add(deploymentJob(status.application().require(job.application().instance()), - versions, - job.type(), - status.instanceJobs(job.application().instance()).get(job.type()), - readyAt)); - }); + for (Versions versions : versionsList) + status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change()) + .filter(readyAt -> ! clock.instant().isBefore(readyAt)) + .filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job))) + .filter(__ -> abortIfRunning(versionsList, status.jobs().get(job).get())) // Abort and trigger this later if running with outdated parameters. + .ifPresent(readyAt -> { + jobs.add(deploymentJob(status.application().require(job.application().instance()), + versions, + job.type(), + status.instanceJobs(job.application().instance()).get(job.type()), + readyAt)); + }); }); return Collections.unmodifiableList(jobs); } @@ -335,6 +323,19 @@ public class DeploymentTrigger { return false; } + /** Returns whether the job is not running, and also aborts it if it's running with outdated versions. */ + private boolean abortIfRunning(List<Versions> versionsList, JobStatus status) { + if ( ! status.isRunning()) + return true; + + Run last = status.lastTriggered().get(); + if (versionsList.stream().noneMatch(versions -> versions.targetsMatch(last.versions()) + && versions.sourcesMatchIfPresent(last.versions()))) + controller.jobController().abort(last.id()); + + return false; + } + // ---------- Change management o_O ---------- private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java index e52d1900a9d..86b9370150c 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java @@ -114,7 +114,7 @@ public class ControllerTest { context.runJob(stagingTest); // production job succeeding now - context.jobAborted(productionUsWest1); + context.triggerJobs().jobAborted(productionUsWest1); context.runJob(productionUsWest1); // causes triggering of next production job diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 70a967ecef9..d4c9425fc03 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -127,7 +127,7 @@ public class DeploymentTriggerTest { assertEquals(upgrade.with(app.lastSubmission().get()), app.instance().change()); } - @Test + @Test public void abortsJobsOnNewApplicationChange() { var app = tester.newDeploymentContext(); app.submit() @@ -141,15 +141,15 @@ public class DeploymentTriggerTest { app.submit(); assertTrue(tester.jobs().active(id).isPresent()); + tester.triggerJobs(); tester.runner().run(); - assertFalse(tester.jobs().active(id).isPresent()); + assertTrue(tester.jobs().active(id).isPresent()); // old run + app.runJob(systemTest).runJob(stagingTest).runJob(stagingTest); // outdated run is aborted when otherwise blocking a new run tester.triggerJobs(); - assertEquals(EnumSet.of(systemTest, stagingTest), tester.jobs().active().stream() - .map(run -> run.id().type()) - .collect(Collectors.toCollection(() -> EnumSet.noneOf(JobType.class)))); + app.jobAborted(productionUsCentral1); - app.deploy(); + app.runJob(productionUsCentral1).runJob(productionUsWest1).runJob(productionUsEast3); assertEquals(Change.empty(), app.instance().change()); tester.controllerTester().upgradeSystem(new Version("8.9")); @@ -159,6 +159,8 @@ public class DeploymentTriggerTest { // Jobs are not aborted when the new submission remains outstanding. app.submit(); + app.runJob(systemTest).runJob(stagingTest); + tester.triggerJobs(); tester.runner().run(); assertEquals(EnumSet.of(productionUsCentral1), tester.jobs().active().stream() .map(run -> run.id().type()) @@ -489,8 +491,9 @@ public class DeploymentTriggerTest { Version version2 = Version.fromString("7.2"); tester.controllerTester().upgradeSystem(version2); tester.upgrader().maintain(); - app1.runJob(systemTest).runJob(stagingTest) // tests for previous version — these are "reused" later. - .runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1); + tester.triggerJobs(); + app1.jobAborted(systemTest).jobAborted(stagingTest); + app1.runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1); assertEquals(version2, app1.deployment(productionUsCentral1.zone(main)).version()); Instant triggered = app1.instanceJobs().get(productionUsCentral1).lastTriggered().get().start(); tester.clock().advance(Duration.ofHours(1)); @@ -502,7 +505,8 @@ public class DeploymentTriggerTest { assertEquals("Change becomes latest non-broken version", Change.of(version1), app1.instance().change()); // version1 proceeds 'til the last job, where it fails; us-central-1 is skipped, as current change is strictly dominated by what's deployed there. - app1.failDeployment(productionEuWest1); + app1.runJob(systemTest).runJob(stagingTest) + .failDeployment(productionEuWest1); assertEquals(triggered, app1.instanceJobs().get(productionUsCentral1).lastTriggered().get().start()); // Roll out a new application version, which gives a dual change -- this should trigger us-central-1, but only as long as it hasn't yet deployed there. @@ -569,7 +573,7 @@ public class DeploymentTriggerTest { app.runJob(systemTest).runJob(stagingTest); // Finish old run of the aborted production job. - app.jobAborted(productionUsEast3); + app.triggerJobs().jobAborted(productionUsEast3); // New upgrade is already tested for both jobs. @@ -722,16 +726,15 @@ public class DeploymentTriggerTest { // Finish deployment for apps 2 and 3, then release a new version, leaving only app1 with an application upgrade. app2.deploy(); app3.deploy(); - app1.assertRunning(stagingTest); - assertEquals(1, tester.jobs().active().size()); + app1.runJob(stagingTest); + assertEquals(0, tester.jobs().active().size()); tester.controllerTester().upgradeSystem(new Version("6.2")); tester.upgrader().maintain(); app1.submit(applicationPackage); - app1.jobAborted(stagingTest); // Tests for app1 trigger before the others since it carries an application upgrade. - tester.readyJobsTrigger().maintain(); + tester.readyJobsTrigger().run(); app1.assertRunning(systemTest); app1.assertRunning(stagingTest); assertEquals(2, tester.jobs().active().size()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 326f4bf311e..91b5dc232e5 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -167,7 +167,7 @@ public class UpgraderTest { // --- Failing application is repaired by changing the application, causing confidence to move above 'high' threshold // Deploy application change default0.submit(applicationPackage("default")); - default0.jobAborted(stagingTest); + default0.triggerJobs().jobAborted(stagingTest); default0.deploy(); tester.controllerTester().computeVersionStatus(); @@ -232,7 +232,7 @@ public class UpgraderTest { // State: Default applications started upgrading to version5 tester.clock().advance(Duration.ofHours(1)); tester.upgrader().maintain(); - default3.failDeployment(stagingTest); + default3.triggerJobs().jobAborted(stagingTest); default0.runJob(systemTest) .failDeployment(stagingTest); default1.runJob(systemTest) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java index ebfd78c2764..8a182ce5eca 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java @@ -84,7 +84,6 @@ import java.io.File; import java.math.BigInteger; import java.net.URI; import java.security.cert.X509Certificate; -import java.time.Duration; import java.time.Instant; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json index 55be0881ec2..2813bd0ab7d 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json @@ -413,7 +413,7 @@ "id": 1, "url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/production-us-west-1/run/1", "start": "(ignore)", - "status": "aborted", + "status": "running", "versions": { "targetPlatform": "6.1.0", "targetApplication": { @@ -469,7 +469,7 @@ "id": 2, "url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/production-us-east-3/run/2", "start": "(ignore)", - "status": "aborted", + "status": "running", "versions": { "targetPlatform": "6.1.0", "targetApplication": { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java index 03002aee955..e3fef9f9066 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java @@ -304,7 +304,7 @@ public class VersionStatusTest { Confidence.low, confidence(tester.controller(), version2)); // Remaining canary upgrades to version2 which raises confidence to normal and more apps upgrade - canary2.failDeployment(systemTest); + canary2.triggerJobs().jobAborted(systemTest).jobAborted(stagingTest); canary2.runJob(stagingTest); canary2.deployPlatform(version2); tester.controllerTester().computeVersionStatus(); |