diff options
Diffstat (limited to 'controller-server')
3 files changed, 78 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java index c3424b8d9af..6c57c9423ff 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java @@ -48,9 +48,6 @@ public class FailureRedeployer extends Maintainer { private void retryStuckJobs(List<Application> applications) { Instant maxAge = controller().clock().instant().minus(jobTimeout); for (Application application : applications) { - if (!application.deploying().isPresent()) { - continue; - } Optional<JobStatus> job = oldestRunningJob(application); if (!job.isPresent()) { continue; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MockMetricsService.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MockMetricsService.java index 360fd8616d3..79b4c5f6d6a 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MockMetricsService.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MockMetricsService.java @@ -4,6 +4,9 @@ package com.yahoo.vespa.hosted.controller.integration; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Zone; +import java.util.Collections; +import java.util.List; + /** * @author bratseth */ @@ -19,4 +22,11 @@ public class MockMetricsService implements com.yahoo.vespa.hosted.controller.api return new DeploymentMetrics(1, 2, 3, 4, 5); } + @Override + public List<ClusterCostMetrics> getClusterCostMetrics(ApplicationId application, Zone zone) { + CostMetrics costMetrics = new CostMetrics(55.54, 69.90, 34.59); + ClusterCostMetrics clusterCostMetrics = new ClusterCostMetrics("default", costMetrics); + 
return Collections.singletonList(clusterCostMetrics); + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java index b5ee0469e9f..38ddd8a4a1b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer; +import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.nio.file.Files; @@ -120,13 +121,79 @@ public class FailureRedeployerTest { tester.failureRedeployer().maintain(); assertEquals(DeploymentJobs.JobType.component.id(), tester.buildSystem().takeJobsToRun().get(0).jobName()); - // Ensure that system-test is trigered after component. Triggering component records a new change, but in this + // Ensure that system-test is triggered after component. 
Triggering component records a new change, but in this // case there's already a change in progress which we want to discard and start over tester.notifyJobCompletion(DeploymentJobs.JobType.component, app, true); assertEquals(DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); } @Test + public void testAlwaysRestartsDeploymentOfApplicationsWithStuckJobs() { + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("5.0"); + tester.updateVersionStatus(version); + + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-west-1") + .build(); + + // Setup applications + Application canary0 = tester.createAndDeploy("canary0", 0, "canary"); + Application canary1 = tester.createAndDeploy("canary1", 1, "canary"); + Application default0 = tester.createAndDeploy("default0", 2, "default"); + Application default1 = tester.createAndDeploy("default1", 3, "default"); + Application default2 = tester.createAndDeploy("default2", 4, "default"); + Application default3 = tester.createAndDeploy("default3", 5, "default"); + Application default4 = tester.createAndDeploy("default4", 6, "default"); + + // New version is released + version = Version.fromString("5.1"); + tester.updateVersionStatus(version); + assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + // Canaries upgrade and raise confidence + tester.completeUpgrade(canary0, version, "canary"); + tester.completeUpgrade(canary1, version, "canary"); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Applications with default policy start upgrading + tester.clock().advance(Duration.ofMinutes(1)); + tester.upgrader().maintain(); + assertEquals("Upgrade scheduled for remaining apps", 5, 
tester.buildSystem().jobs().size()); + + // 4/5 applications fail, confidence is lowered and upgrade is cancelled + tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); + tester.upgrader().maintain(); + + // 5th app never reports back and has a deadlocked job, but no ongoing change + Application deadLocked = tester.applications().require(default4.id()); + assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); + assertFalse("No change present", deadLocked.deploying().isPresent()); + + // 4/5 applications are repaired and confidence is restored + tester.deployCompletely(default0, applicationPackage); + tester.deployCompletely(default1, applicationPackage); + tester.deployCompletely(default2, applicationPackage); + tester.deployCompletely(default3, applicationPackage); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Over 12 hours pass and failure redeployer restarts deployment of 5th app + tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); + tester.failureRedeployer().maintain(); + assertEquals("Deployment is restarted", DeploymentJobs.JobType.component.id(), + tester.buildSystem().jobs().get(0).jobName()); + } + + @Test public void testRetriesJobsFailingForCurrentChange() { DeploymentTester tester = new DeploymentTester(); ApplicationPackage applicationPackage = new ApplicationPackageBuilder() |