aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorjonmv <venstad@gmail.com>2022-11-10 11:12:21 +0100
committerjonmv <venstad@gmail.com>2022-11-10 11:12:21 +0100
commit463ad9db000b6383215b060567c68122ac09d88a (patch)
treed3ce4343b52c9324362b220b9b5bb95a62b9f8af /controller-server
parentb26cbb29aec75b76767e6e1631eae2a11f035823 (diff)
Avoid rare deadlock with production test/deployment triggering
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java30
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java59
2 files changed, 76 insertions, 13 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
index e16b1378377..4c314c64f7f 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
@@ -40,6 +40,7 @@ import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
@@ -603,21 +604,24 @@ public class DeploymentStatus {
// the revision is now blocked by waiting for the production test to verify the upgrade.
// In this case we must abandon the production test on the pure upgrade, so the revision can be deployed.
if (platformDeployedAt.isPresent() && revisionDeployedAt.isEmpty()) {
- if (jobSteps.get(deployment).readyAt(change, Optional.of(deployment))
- .map(ready -> ! now.isBefore(ready)).orElse(false)) {
- switch (rollout) {
- // If separate rollout, this test should keep blocking the revision, unless there are failures.
- case separate: return hasFailures(jobSteps.get(deployment), jobSteps.get(job)) ? List.of(change) : List.of(change.withoutApplication(), change);
- // If leading rollout, this test should now expect the two changes to fuse and roll together.
- case leading: return List.of(change);
- // If simultaneous rollout, this test should now expect the revision to run ahead.
- case simultaneous: return List.of(change.withoutPlatform(), change);
- }
- }
+ if (jobSteps.get(deployment).readyAt(change, Optional.of(deployment))
+ .map(ready -> ! now.isBefore(ready)).orElse(false)) {
+ return switch (rollout) {
+ // If separate rollout, this test should keep blocking the revision, unless there are failures.
+ case separate -> hasFailures(jobSteps.get(deployment), jobSteps.get(job)) ? List.of(change) : List.of(change.withoutApplication(), change);
+ // If leading rollout, this test should now expect the two changes to fuse and roll together.
+ case leading -> List.of(change);
+ // If simultaneous rollout, this test should now expect the revision to run ahead.
+ case simultaneous -> List.of(change.withoutPlatform(), change);
+ };
+ }
return List.of(change.withoutApplication(), change);
}
- // If neither is deployed, then neither is ready, and we guess like for deployments.
- // If both are deployed, then we need to follow normal logic for whatever is ready.
+ // If neither is deployed, then neither is ready, and we assume the same order of changes as for the deployment job.
+ if (platformDeployedAt.isEmpty())
+ return changes(deployment, jobSteps.get(deployment), change);
+
+ // If both are deployed, then we need to follow normal logic for what is ready.
}
Optional<Instant> platformReadyAt = step.dependenciesCompletedAt(change.withoutApplication(), Optional.of(job));
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index ea9cf0385d9..9fdecfa625e 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -2787,4 +2787,63 @@ public class DeploymentTriggerTest {
assertThrows(IllegalStateException.class, JobType.fromJobName("staging-test", zones)::zone);
}
+ @Test
+ void test() {
+ String deploymentXml = """
+ <deployment version="1.0">
+ <test/>
+ <staging/>
+ <block-change days="fri" hours="0-23" time-zone="UTC" />
+ <prod>
+ <region>us-east-3</region>
+ <delay hours="1"/>
+ <test>us-east-3</test>
+ <region>us-west-1</region>
+ </prod>
+ </deployment>""";
+
+ // TODO jonmv: recreate problem where revision starts, then upgrade, while prod is blocked,
+ // then both are tested as separate upgrades, but prod-test has them reversed.
+
+ Version version1 = new Version("7.1");
+ tester.controllerTester().upgradeSystem(version1);
+ ApplicationPackage applicationPackage = ApplicationPackageBuilder.fromDeploymentXml(deploymentXml);
+ tester.clock().setInstant(Instant.EPOCH.plusSeconds(8 * 60 * 60)); // Thursday morning.
+ DeploymentContext app = tester.newDeploymentContext().submit(applicationPackage);
+ RevisionId revision1 = app.lastSubmission().get();
+ app.runJob(systemTest).runJob(stagingTest).runJob(productionUsEast3);
+ tester.clock().advance(Duration.ofHours(1));
+ app.runJob(testUsEast3).runJob(productionUsWest1);
+ assertEquals(Change.empty(), app.instance().change());
+
+ tester.clock().advance(Duration.ofDays(1)); // Enter block window.
+ app.submit(applicationPackage);
+ assertEquals(Change.empty(), app.instance().change());
+
+ Version version2 = new Version("7.2");
+ RevisionId revision2 = app.lastSubmission().get();
+
+ app.runJob(systemTest).runJob(stagingTest);
+ app.triggerJobs().assertNotRunning(productionUsEast3);
+ tester.controllerTester().upgradeSystem(version2);
+ tester.clock().advance(Duration.ofDays(1)); // Leave block window.
+ tester.upgrader().run();
+ tester.outstandingChangeDeployer().run();
+ assertEquals(Change.of(revision2).with(version2), app.instance().change());
+ app.runJob(systemTest).runJob(stagingTest);
+ app.runJob(productionUsEast3);
+ app.triggerJobs();
+ app.assertNotRunning(productionUsEast3); // Platform upgrade should not start before test is done with revision.
+ tester.clock().advance(Duration.ofHours(1));
+ app.triggerJobs();
+ app.assertNotRunning(productionUsEast3); // Platform upgrade should not start before test is done with revision.
+ app.runJob(testUsEast3);
+ app.runJob(productionUsEast3)
+ .runJob(productionUsWest1);
+ tester.clock().advance(Duration.ofHours(1));
+ app.runJob(testUsEast3);
+ app.runJob(productionUsWest1);
+ assertEquals(Change.empty(), app.instance().change());
+ }
+
}