From 4aee0a32075138bce8aeeee004c72feb2a35a3bc Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Tue, 26 Feb 2019 16:20:01 +0100 Subject: Run ACL maintainer if suspension fails --- .../admin/configserver/orchestrator/OrchestratorImpl.java | 2 +- .../vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'node-admin/src/main/java') diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java index 888e0195657..64a67aa612a 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java @@ -42,7 +42,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to suspend " + hostName + ": " + e.toString()); - } catch (Exception e) { + } catch (RuntimeException e) { throw new RuntimeException("Got error on suspend", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index dc968a8717e..6e3a507c649 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -659,7 +659,20 @@ public class NodeAgentImpl implements NodeAgent { if (context.node().getState() != NodeState.active) return; context.log(logger, "Ask Orchestrator for permission to suspend node"); - orchestrator.suspend(context.hostname().value()); + try { + orchestrator.suspend(context.hostname().value()); + } catch (OrchestratorException e) { + // Ensure 
the ACLs are up to date: The reason we're unable to suspend may be because some other + // node is unable to resume because the ACL rules of SOME Docker container is wrong... + try { + aclMaintainer.ifPresent(maintainer -> maintainer.converge(context)); + } catch (RuntimeException suppressed) { + logger.log(LogLevel.WARNING, "Suppressing ACL update failure: " + suppressed); + e.addSuppressed(suppressed); + } + + throw e; + } } protected ContainerData createContainerData(NodeAgentContext context) { -- cgit v1.2.3