diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2020-01-22 14:16:02 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-01-22 14:16:02 +0100 |
commit | 77d1dde0fdf1f5af7a8ad85de5bb464ef01b1c0e (patch) | |
tree | 7886540bc7e3214186f131c829985d82816bc5c8 | |
parent | 7d1ed12140fb9b1ac023812582581946a09ac4b9 (diff) | |
parent | 5d1c98f314c34aae37aad6b473ed04eb593cbe4a (diff) |
Merge pull request #11892 from vespa-engine/jvenstad/step-runner-retries-for-limited-time
Jvenstad/step runner retries for limited time
2 files changed, 58 insertions, 32 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 9d09394a571..49282eaad0c 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -52,6 +52,7 @@ import java.security.cert.CertificateExpiredException; import java.security.cert.CertificateNotYetValidException; import java.security.cert.X509Certificate; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -76,6 +77,11 @@ import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installatio import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.outOfCapacity; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.testFailure; +import static com.yahoo.vespa.hosted.controller.deployment.Step.deactivateReal; +import static com.yahoo.vespa.hosted.controller.deployment.Step.deactivateTester; +import static com.yahoo.vespa.hosted.controller.deployment.Step.deployInitialReal; +import static com.yahoo.vespa.hosted.controller.deployment.Step.deployReal; +import static com.yahoo.vespa.hosted.controller.deployment.Step.deployTester; import static com.yahoo.vespa.hosted.controller.deployment.Step.installTester; import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.logging.Level.INFO; @@ -186,6 +192,9 @@ public class InternalStepRunner implements StepRunner { vespaVersion, false, setTheStage)), + controller.jobController().run(id).get() + .stepInfo(setTheStage ? deployInitialReal : deployReal).get() + .startTime().get(), logger); } @@ -201,10 +210,14 @@ public class InternalStepRunner implements StepRunner { Optional.of(platform), false, false)), + controller.jobController().run(id).get() + .stepInfo(deployTester).get() + .startTime().get(), logger); } - private Optional<RunStatus> deploy(ApplicationId id, JobType type, Supplier<ActivateResult> deployment, DualLogger logger) { + private Optional<RunStatus> deploy(ApplicationId id, JobType type, Supplier<ActivateResult> deployment, + Instant startTime, DualLogger logger) { try { PrepareResponse prepareResponse = deployment.get().prepareResponse(); if ( ! prepareResponse.configChangeActions.refeedActions.stream().allMatch(action -> action.allowed)) { @@ -246,17 +259,20 @@ public class InternalStepRunner implements StepRunner { return Optional.of(running); } catch (ConfigServerException e) { + // Retry certain failures for up to one hour. + Optional<RunStatus> result = startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1))) + ? Optional.of(deploymentFailed) : Optional.empty(); switch (e.getErrorCode()) { case ACTIVATION_CONFLICT: case APPLICATION_LOCK_FAILURE: case CERTIFICATE_NOT_READY: logger.log("Deployment failed with possibly transient error " + e.getErrorCode() + ", will retry: " + e.getMessage()); - return Optional.empty(); + return result; case LOAD_BALANCER_NOT_READY: case PARENT_HOST_NOT_READY: logger.log(e.getServerMessage()); - return Optional.empty(); + return result; case OUT_OF_CAPACITY: logger.log(e.getServerMessage()); return Optional.of(outOfCapacity); @@ -537,48 +553,34 @@ public class InternalStepRunner implements StepRunner { private Optional<RunStatus> deactivateReal(RunId id, DualLogger logger) { try { - return retrying(10, () -> { - logger.log("Deactivating deployment of " + id.application() + " in " + id.type().zone(controller.system()) + " ..."); - controller.applications().deactivate(id.application(), id.type().zone(controller.system())); - return running; - }); + logger.log("Deactivating deployment of " + id.application() + " in " + id.type().zone(controller.system()) + " ..."); + controller.applications().deactivate(id.application(), id.type().zone(controller.system())); + return Optional.of(running); } catch (RuntimeException e) { logger.log(WARNING, "Failed deleting application " + id.application(), e); - return Optional.of(error); + Instant startTime = controller.jobController().run(id).get().stepInfo(deactivateReal).get().startTime().get(); + return startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1))) + ? Optional.of(error) + : Optional.empty(); } } private Optional<RunStatus> deactivateTester(RunId id, DualLogger logger) { try { - return retrying(10, () -> { - logger.log("Deactivating tester of " + id.application() + " in " + id.type().zone(controller.system()) + " ..."); - controller.jobController().deactivateTester(id.tester(), id.type()); - return running; - }); + logger.log("Deactivating tester of " + id.application() + " in " + id.type().zone(controller.system()) + " ..."); + controller.jobController().deactivateTester(id.tester(), id.type()); + return Optional.of(running); } catch (RuntimeException e) { logger.log(WARNING, "Failed deleting tester of " + id.application(), e); - return Optional.of(error); + Instant startTime = controller.jobController().run(id).get().stepInfo(deactivateTester).get().startTime().get(); + return startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1))) + ? Optional.of(error) + : Optional.empty(); } } - private static Optional<RunStatus> retrying(int retries, Supplier<RunStatus> task) { - RuntimeException exception = null; - do { - try { - return Optional.of(task.get()); - } - catch (RuntimeException e) { - if (exception == null) - exception = e; - else - exception.addSuppressed(e); - } - } while (--retries >= 0); - throw exception; - } - private Optional<RunStatus> report(RunId id, DualLogger logger) { try { controller.jobController().active(id).ifPresent(run -> { @@ -648,7 +650,7 @@ public class InternalStepRunner implements StepRunner { // TODO jonmv: This is a workaround for new deployment writes not yet being visible in spite of Curator locking. // TODO Investigate what's going on here, and remove this workaround. Run run = controller.jobController().run(id).get(); - if (run.start().isAfter(deployment.at())) + if ( ! controller.system().isCd() && run.start().isAfter(deployment.at())) return false; Duration timeout = controller.zoneRegistry().getDeploymentTimeToLive(deployment.zone()) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java index e052b967c31..005f6a7351b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java @@ -18,6 +18,7 @@ import com.yahoo.vespa.hosted.controller.api.application.v4.model.configserverbi import com.yahoo.vespa.hosted.controller.api.application.v4.model.configserverbindings.ServiceInfo; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.api.integration.LogEntry; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud; @@ -50,6 +51,7 @@ import static com.yahoo.vespa.hosted.controller.api.integration.LogEntry.Type.wa import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.applicationPackage; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.publicCdApplicationPackage; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTester.instanceId; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed; import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.failed; import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded; import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.unfinished; @@ -96,6 +98,28 @@ public class InternalStepRunnerTest { } @Test + public void retriesDeploymentForOneHour() { + RuntimeException exception = new ConfigServerException(URI.create("https://server"), + "test failure", + "Exception to retry", + ConfigServerException.ErrorCode.APPLICATION_LOCK_FAILURE, + new RuntimeException("Retry me")); + tester.configServer().throwOnNextPrepare(exception); + tester.jobs().deploy(app.instanceId(), JobType.devUsEast1, Optional.empty(), applicationPackage); + assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().stepStatuses().get(Step.deployReal)); + + tester.configServer().throwOnNextPrepare(exception); + tester.runner().run(); + assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().stepStatuses().get(Step.deployReal)); + + tester.clock().advance(Duration.ofHours(1).plusSeconds(1)); + tester.configServer().throwOnNextPrepare(exception); + tester.runner().run(); + assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().stepStatuses().get(Step.deployReal)); + assertEquals(deploymentFailed, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().status()); + } + + @Test public void refeedRequirementBlocksDeployment() { tester.configServer().setConfigChangeActions(new ConfigChangeActions(Collections.emptyList(), singletonList(new RefeedAction("Refeed", |