From 43e57094436eac53198f5f1f7933ee2d208c0a97 Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Wed, 21 Dec 2022 14:27:40 +0100 Subject: Reduce NodeFailer activate timeout --- .../com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'node-repository/src/main') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 203bb664c1c..84a45de39d7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.concurrent.UncheckedTimeoutException; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.NodeType; @@ -186,7 +187,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { */ private boolean failActive(FailingNode failing) { Optional deployment = - deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30)); + deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5)); if (deployment.isEmpty()) return false; // If the active node that we are trying to fail is of type host, we need to successfully fail all @@ -214,11 +215,12 @@ public class NodeFailer extends NodeRepositoryMaintainer { } if (activeChildrenToFail.isEmpty()) { + log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason); wantToFail(failing.node(), true, lock); try { deployment.get().activate(); return true; - } catch (TransientException e) { + } catch (TransientException | 
UncheckedTimeoutException e) { log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e)); -- cgit v1.2.3