aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Marius Venstad <jonmv@users.noreply.github.com>2020-01-22 14:16:02 +0100
committerGitHub <noreply@github.com>2020-01-22 14:16:02 +0100
commit77d1dde0fdf1f5af7a8ad85de5bb464ef01b1c0e (patch)
tree7886540bc7e3214186f131c829985d82816bc5c8
parent7d1ed12140fb9b1ac023812582581946a09ac4b9 (diff)
parent5d1c98f314c34aae37aad6b473ed04eb593cbe4a (diff)
Merge pull request #11892 from vespa-engine/jvenstad/step-runner-retries-for-limited-time
Jvenstad/step runner retries for limited time
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java66
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java24
2 files changed, 58 insertions, 32 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 9d09394a571..49282eaad0c 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -52,6 +52,7 @@ import java.security.cert.CertificateExpiredException;
import java.security.cert.CertificateNotYetValidException;
import java.security.cert.X509Certificate;
import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@@ -76,6 +77,11 @@ import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installatio
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.outOfCapacity;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.testFailure;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.deactivateReal;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.deactivateTester;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.deployInitialReal;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.deployReal;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.deployTester;
import static com.yahoo.vespa.hosted.controller.deployment.Step.installTester;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.logging.Level.INFO;
@@ -186,6 +192,9 @@ public class InternalStepRunner implements StepRunner {
vespaVersion,
false,
setTheStage)),
+ controller.jobController().run(id).get()
+ .stepInfo(setTheStage ? deployInitialReal : deployReal).get()
+ .startTime().get(),
logger);
}
@@ -201,10 +210,14 @@ public class InternalStepRunner implements StepRunner {
Optional.of(platform),
false,
false)),
+ controller.jobController().run(id).get()
+ .stepInfo(deployTester).get()
+ .startTime().get(),
logger);
}
- private Optional<RunStatus> deploy(ApplicationId id, JobType type, Supplier<ActivateResult> deployment, DualLogger logger) {
+ private Optional<RunStatus> deploy(ApplicationId id, JobType type, Supplier<ActivateResult> deployment,
+ Instant startTime, DualLogger logger) {
try {
PrepareResponse prepareResponse = deployment.get().prepareResponse();
if ( ! prepareResponse.configChangeActions.refeedActions.stream().allMatch(action -> action.allowed)) {
@@ -246,17 +259,20 @@ public class InternalStepRunner implements StepRunner {
return Optional.of(running);
}
catch (ConfigServerException e) {
+ // Retry certain failures for up to one hour.
+ Optional<RunStatus> result = startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1)))
+ ? Optional.of(deploymentFailed) : Optional.empty();
switch (e.getErrorCode()) {
case ACTIVATION_CONFLICT:
case APPLICATION_LOCK_FAILURE:
case CERTIFICATE_NOT_READY:
logger.log("Deployment failed with possibly transient error " + e.getErrorCode() +
", will retry: " + e.getMessage());
- return Optional.empty();
+ return result;
case LOAD_BALANCER_NOT_READY:
case PARENT_HOST_NOT_READY:
logger.log(e.getServerMessage());
- return Optional.empty();
+ return result;
case OUT_OF_CAPACITY:
logger.log(e.getServerMessage());
return Optional.of(outOfCapacity);
@@ -537,48 +553,34 @@ public class InternalStepRunner implements StepRunner {
private Optional<RunStatus> deactivateReal(RunId id, DualLogger logger) {
try {
- return retrying(10, () -> {
- logger.log("Deactivating deployment of " + id.application() + " in " + id.type().zone(controller.system()) + " ...");
- controller.applications().deactivate(id.application(), id.type().zone(controller.system()));
- return running;
- });
+ logger.log("Deactivating deployment of " + id.application() + " in " + id.type().zone(controller.system()) + " ...");
+ controller.applications().deactivate(id.application(), id.type().zone(controller.system()));
+ return Optional.of(running);
}
catch (RuntimeException e) {
logger.log(WARNING, "Failed deleting application " + id.application(), e);
- return Optional.of(error);
+ Instant startTime = controller.jobController().run(id).get().stepInfo(deactivateReal).get().startTime().get();
+ return startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1)))
+ ? Optional.of(error)
+ : Optional.empty();
}
}
private Optional<RunStatus> deactivateTester(RunId id, DualLogger logger) {
try {
- return retrying(10, () -> {
- logger.log("Deactivating tester of " + id.application() + " in " + id.type().zone(controller.system()) + " ...");
- controller.jobController().deactivateTester(id.tester(), id.type());
- return running;
- });
+ logger.log("Deactivating tester of " + id.application() + " in " + id.type().zone(controller.system()) + " ...");
+ controller.jobController().deactivateTester(id.tester(), id.type());
+ return Optional.of(running);
}
catch (RuntimeException e) {
logger.log(WARNING, "Failed deleting tester of " + id.application(), e);
- return Optional.of(error);
+ Instant startTime = controller.jobController().run(id).get().stepInfo(deactivateTester).get().startTime().get();
+ return startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1)))
+ ? Optional.of(error)
+ : Optional.empty();
}
}
- private static Optional<RunStatus> retrying(int retries, Supplier<RunStatus> task) {
- RuntimeException exception = null;
- do {
- try {
- return Optional.of(task.get());
- }
- catch (RuntimeException e) {
- if (exception == null)
- exception = e;
- else
- exception.addSuppressed(e);
- }
- } while (--retries >= 0);
- throw exception;
- }
-
private Optional<RunStatus> report(RunId id, DualLogger logger) {
try {
controller.jobController().active(id).ifPresent(run -> {
@@ -648,7 +650,7 @@ public class InternalStepRunner implements StepRunner {
// TODO jonmv: This is a workaround for new deployment writes not yet being visible in spite of Curator locking.
// TODO Investigate what's going on here, and remove this workaround.
Run run = controller.jobController().run(id).get();
- if (run.start().isAfter(deployment.at()))
+ if ( ! controller.system().isCd() && run.start().isAfter(deployment.at()))
return false;
Duration timeout = controller.zoneRegistry().getDeploymentTimeToLive(deployment.zone())
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
index e052b967c31..005f6a7351b 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
@@ -18,6 +18,7 @@ import com.yahoo.vespa.hosted.controller.api.application.v4.model.configserverbi
import com.yahoo.vespa.hosted.controller.api.application.v4.model.configserverbindings.ServiceInfo;
import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.api.integration.LogEntry;
+import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud;
@@ -50,6 +51,7 @@ import static com.yahoo.vespa.hosted.controller.api.integration.LogEntry.Type.wa
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.applicationPackage;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.publicCdApplicationPackage;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTester.instanceId;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.failed;
import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded;
import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.unfinished;
@@ -96,6 +98,28 @@ public class InternalStepRunnerTest {
}
@Test
+ public void retriesDeploymentForOneHour() {
+ RuntimeException exception = new ConfigServerException(URI.create("https://server"),
+ "test failure",
+ "Exception to retry",
+ ConfigServerException.ErrorCode.APPLICATION_LOCK_FAILURE,
+ new RuntimeException("Retry me"));
+ tester.configServer().throwOnNextPrepare(exception);
+ tester.jobs().deploy(app.instanceId(), JobType.devUsEast1, Optional.empty(), applicationPackage);
+ assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().stepStatuses().get(Step.deployReal));
+
+ tester.configServer().throwOnNextPrepare(exception);
+ tester.runner().run();
+ assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().stepStatuses().get(Step.deployReal));
+
+ tester.clock().advance(Duration.ofHours(1).plusSeconds(1));
+ tester.configServer().throwOnNextPrepare(exception);
+ tester.runner().run();
+ assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().stepStatuses().get(Step.deployReal));
+ assertEquals(deploymentFailed, tester.jobs().last(app.instanceId(), JobType.devUsEast1).get().status());
+ }
+
+ @Test
public void refeedRequirementBlocksDeployment() {
tester.configServer().setConfigChangeActions(new ConfigChangeActions(Collections.emptyList(),
singletonList(new RefeedAction("Refeed",