diff options

| | | |
|---|---|---|
| author | jonmv <venstad@gmail.com> | 2022-09-22 12:17:23 +0200 |
| committer | jonmv <venstad@gmail.com> | 2022-09-22 12:17:23 +0200 |
| commit | 26478576dede3f788144682c167ec4fd3f82be30 (patch) | |
| tree | 55c51504671b9af7796af4992a3f33537e3db8f3 /controller-server | |
| parent | 1957bf2e63c2ad903bd1f6ccbde48de1141dca11 (diff) | |
Allow 6 failing apps before broken confidence
Diffstat (limited to 'controller-server')
3 files changed, 32 insertions, 15 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/VespaVersion.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/VespaVersion.java index 0814e9ea6ec..46bcd3b85c0 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/VespaVersion.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/VespaVersion.java @@ -44,7 +44,7 @@ public record VespaVersion(Version version, if ( ! failingOnThis.with(UpgradePolicy.canary).isEmpty()) return Confidence.broken; - // 'broken' if 4 non-canary was broken by this, and that is at least 5% of all + // 'broken' if 6 non-canary was broken by this, and that is at least 5% of all if (nonCanaryApplicationsBroken(statistics.version(), failingOnThis, productionOnThis)) return Confidence.broken; @@ -161,9 +161,9 @@ public record VespaVersion(Version version, if (productionNonCanaries.size() + failingNonCanaries.size() == 0) return false; - // 'broken' if 4 non-canary was broken by this, and that is at least 5% of all + // 'broken' if 6 non-canary was broken by this, and that is at least 5% of all int brokenByThisVersion = failingNonCanaries.size(); - return brokenByThisVersion >= 4 && brokenByThisVersion >= productionOnThis.size() * 0.05; + return brokenByThisVersion >= 6 && brokenByThisVersion >= productionOnThis.size() * 0.05; } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 35ccc86d465..45038fc4a63 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -189,6 +189,7 @@ public class UpgraderTest { // --- Starting upgrading to a new version which breaks, causing upgrades to commence on the 
previous version var default3 = createAndDeploy("default3", "default"); var default4 = createAndDeploy("default4", "default"); + var default5 = createAndDeploy("default5", "default"); Version version4 = Version.fromString("6.6"); tester.controllerTester().upgradeSystem(version4); tester.upgrader().maintain(); // cause canary upgrades to new version @@ -200,8 +201,8 @@ public class UpgraderTest { tester.upgrader().maintain(); tester.triggerJobs(); - assertEquals(10, tester.jobs().active().size(), "Upgrade of defaults are scheduled"); - for (var context : List.of(default0, default1, default2, default3, default4)) + assertEquals(12, tester.jobs().active().size(), "Upgrade of defaults are scheduled"); + for (var context : List.of(default0, default1, default2, default3, default4, default5)) assertEquals(version4, context.instance().change().platform().get()); default0.deployPlatform(version4); @@ -218,9 +219,9 @@ public class UpgraderTest { tester.upgrader().maintain(); tester.triggerJobs(); - assertEquals(10, tester.jobs().active().size(), "Upgrade of defaults are scheduled"); + assertEquals(12, tester.jobs().active().size(), "Upgrade of defaults are scheduled"); assertEquals(version5, default0.instance().change().platform().get()); - for (var context : List.of(default1, default2, default3, default4)) + for (var context : List.of(default1, default2, default3, default4, default5)) assertEquals(version4, context.instance().change().platform().get()); default1.deployPlatform(version4); @@ -233,6 +234,10 @@ public class UpgraderTest { .runJob(stagingTest) .failDeployment(productionUsWest1); + default5.runJob(systemTest) + .runJob(stagingTest) + .failDeployment(productionUsWest1); + // State: Default applications started upgrading to version5 tester.clock().advance(Duration.ofHours(1)); tester.upgrader().maintain(); @@ -247,10 +252,12 @@ public class UpgraderTest { default3.runJob(systemTest) .runJob(stagingTest) .failDeployment(productionUsWest1); + 
default4.failDeployment(systemTest); + default5.failDeployment(systemTest); + tester.controllerTester().computeVersionStatus(); assertEquals(VespaVersion.Confidence.broken, tester.controller().readVersionStatus().systemVersion().get().confidence()); - tester.upgrader().maintain(); assertEquals(version4, default3.instance().change().platform().get()); } @@ -311,10 +318,10 @@ public class UpgraderTest { assertEquals(20, tester.jobs().active().size(), "Canaries done: Should upgrade defaults"); default0.deployPlatform(version); - for (var context : List.of(default1, default2, default3, default4)) + for (var context : List.of(default1, default2, default3, default4, default5, default6)) context.failDeployment(systemTest); - // > 40% and at least 4 failed - version is broken + // > 60% and at least 6 failed - version is broken tester.controllerTester().computeVersionStatus(); tester.upgrader().maintain(); tester.abortAll(); @@ -342,6 +349,8 @@ public class UpgraderTest { var default2 = createAndDeploy("default2", "default"); var default3 = createAndDeploy("default3", "default"); var default4 = createAndDeploy("default4", "default"); + var default5 = createAndDeploy("default5", "default"); + var default6 = createAndDeploy("default6", "default"); // V1 is released Version v1 = Version.fromString("6.3"); @@ -372,27 +381,33 @@ public class UpgraderTest { assertEquals(VespaVersion.Confidence.normal, tester.controller().readVersionStatus().systemVersion().get().confidence()); // We "manually" cancel upgrades to V1 so that we can use the applications to make V2 fail instead - // But we keep one (default4) to avoid V1 being garbage collected + // But we keep one (default6) to avoid V1 being garbage collected tester.deploymentTrigger().cancelChange(default0.instanceId(), ALL); tester.deploymentTrigger().cancelChange(default1.instanceId(), ALL); tester.deploymentTrigger().cancelChange(default2.instanceId(), ALL); tester.deploymentTrigger().cancelChange(default3.instanceId(), ALL); 
+ tester.deploymentTrigger().cancelChange(default4.instanceId(), ALL); + tester.deploymentTrigger().cancelChange(default5.instanceId(), ALL); default0.abortJob(systemTest).abortJob(stagingTest); default1.abortJob(systemTest).abortJob(stagingTest); default2.abortJob(systemTest).abortJob(stagingTest); default3.abortJob(systemTest).abortJob(stagingTest); + default4.abortJob(systemTest).abortJob(stagingTest); + default5.abortJob(systemTest).abortJob(stagingTest); // Applications with default policy start upgrading to V2 tester.upgrader().maintain(); tester.triggerJobs(); - assertEquals(10, tester.jobs().active().size(), "Upgrade scheduled for remaining apps"); - assertEquals(v1, default4.instance().change().platform().get(), "default4 is still upgrading to 6.3"); + assertEquals(14, tester.jobs().active().size(), "Upgrade scheduled for remaining apps"); + assertEquals(v1, default6.instance().change().platform().get(), "default6 is still upgrading to 6.3"); // 4/5 applications fail (in the last prod zone) and lowers confidence default0.runJob(systemTest).runJob(stagingTest).runJob(productionUsWest1).failDeployment(productionUsEast3); default1.runJob(systemTest).runJob(stagingTest).runJob(productionUsWest1).failDeployment(productionUsEast3); default2.runJob(systemTest).runJob(stagingTest).runJob(productionUsWest1).failDeployment(productionUsEast3); default3.runJob(systemTest).runJob(stagingTest).runJob(productionUsWest1).failDeployment(productionUsEast3); + default4.runJob(systemTest).runJob(stagingTest).runJob(productionUsWest1).failDeployment(productionUsEast3); + default5.runJob(systemTest).runJob(stagingTest).runJob(productionUsWest1).failDeployment(productionUsEast3); tester.controllerTester().computeVersionStatus(); assertEquals(VespaVersion.Confidence.broken, tester.controller().readVersionStatus().systemVersion().get().confidence()); @@ -402,7 +417,7 @@ public class UpgraderTest { tester.abortAll(); tester.triggerJobs(); - assertEquals(10, 
tester.jobs().active().size(), "Upgrade to 5.1 scheduled for apps not completely on 5.1 or 5.2"); + assertEquals(14, tester.jobs().active().size(), "Upgrade to 5.1 scheduled for apps not completely on 5.1 or 5.2"); // prod zone on 5.2 (usWest1) is skipped, but we still trigger the next zone from triggerReadyJobs: default0.runJob(systemTest).runJob(stagingTest).runJob(productionUsEast3); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java index ab262a1a3c5..72179aa669c 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java @@ -367,11 +367,13 @@ public class VersionStatusTest { default1.failDeployment(stagingTest); default2.failDeployment(stagingTest); default3.failDeployment(stagingTest); + default4.failDeployment(stagingTest); + default5.failDeployment(stagingTest); tester.controllerTester().computeVersionStatus(); assertEquals(Confidence.high, confidence(tester.controller(), version0), "Confidence remains unchanged for version0: High"); assertEquals(Confidence.high, confidence(tester.controller(), version2), "Confidence remains unchanged for version2: High"); - assertEquals(VespaVersion.Confidence.broken, confidence(tester.controller(), version3), "40% of defaults failed: Broken"); + assertEquals(VespaVersion.Confidence.broken, confidence(tester.controller(), version3), "60% of defaults failed: Broken"); // Test version order List<VespaVersion> versions = tester.controller().readVersionStatus().versions(); |