diff options
author | Håkon Hallingstad <hakon@oath.com> | 2019-02-26 17:38:40 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-02-26 17:38:40 +0100 |
commit | ba131f00cf4b0e8196ad887104af9024832126da (patch) | |
tree | 843f6e94280b227bc6f31f15a222ebc20a997ffe | |
parent | b235a61e3b37c0140fda20133ff317ccd7900dea (diff) | |
parent | 4aee0a32075138bce8aeeee004c72feb2a35a3bc (diff) |
Merge pull request #8618 from vespa-engine/hakonhall/run-acl-maintainer-if-suspension-fails
Run ACL maintainer if suspension fails
3 files changed, 18 insertions, 2 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java index 888e0195657..64a67aa612a 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java @@ -42,7 +42,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to suspend " + hostName + ": " + e.toString()); - } catch (Exception e) { + } catch (RuntimeException e) { throw new RuntimeException("Got error on suspend", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index dc968a8717e..6e3a507c649 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -659,7 +659,20 @@ public class NodeAgentImpl implements NodeAgent { if (context.node().getState() != NodeState.active) return; context.log(logger, "Ask Orchestrator for permission to suspend node"); - orchestrator.suspend(context.hostname().value()); + try { + orchestrator.suspend(context.hostname().value()); + } catch (OrchestratorException e) { + // Ensure the ACLs are up to date: The reason we're unable to suspend may be because some other + // node is unable to resume because the ACL rules of SOME Docker container is wrong... 
+ try { + aclMaintainer.ifPresent(maintainer -> maintainer.converge(context)); + } catch (RuntimeException suppressed) { + logger.log(LogLevel.WARNING, "Suppressing ACL update failure: " + suppressed); + e.addSuppressed(suppressed); + } + + throw e; + } } protected ContainerData createContainerData(NodeAgentContext context) { diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 3130500c940..8b0c3044e7c 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -346,6 +346,9 @@ public class NodeAgentImplTest { verify(dockerOperations, never()).startContainer(eq(context)); verify(orchestrator, never()).resume(any(String.class)); verify(nodeRepository, never()).updateNodeAttributes(any(String.class), any(NodeAttributes.class)); + + // Verify aclMaintainer is called even if suspension fails + verify(aclMaintainer, times(1)).converge(eq(context)); } @Test |