diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2023-08-14 09:53:37 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-14 09:53:37 +0200 |
commit | 5ad80a4933c3bf201b9ae8247374995f81417c1d (patch) | |
tree | 383b2dec04b9d79b0fbd8d5d5c1ed20aa4c0214d | |
parent | 624697ccb1fe5b2aa5d58ff4faf882558471494a (diff) | |
parent | dd9abd1bd7f5064c9e14f961c5c00daa536bf4fa (diff) |
Merge pull request #28027 from vespa-engine/hakonhall/always-run-acl-and-wireguard-maintainers-if-container-is-present
Always run acl and wireguard maintainers if container is present
-rw-r--r-- | node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 34 |
1 files changed, 13 insertions, 21 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 7fc248024c3..284306e1e8c 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -17,7 +17,6 @@ import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeState; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.reports.DropDocumentsReport; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; -import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; import com.yahoo.vespa.hosted.node.admin.container.Container; import com.yahoo.vespa.hosted.node.admin.container.ContainerOperations; import com.yahoo.vespa.hosted.node.admin.container.ContainerResources; @@ -484,6 +483,11 @@ public class NodeAgentImpl implements NodeAgent { lastNode = node; } + // Run this here and now, even though we may immediately remove the container below. + // This ensures these maintainers are run even if something fails or returns early. + // These maintainers should also run immediately after starting the container (see below). + container.ifPresent(c -> runImportantContainerMaintainers(context, c)); + switch (node.state()) { case ready, reserved, failed, inactive, parked -> { storageMaintainer.syncLogs(context, true); @@ -508,13 +512,11 @@ public class NodeAgentImpl implements NodeAgent { containerState = STARTING; container = Optional.of(startContainer(context)); containerState = UNKNOWN; + runImportantContainerMaintainers(context, container.get()); } else { container = Optional.of(updateContainerIfNeeded(context, container.get())); } - aclMaintainer.ifPresent(maintainer -> maintainer.converge(context)); - final Optional<Container> finalContainer = container; - wireguardTasks.forEach(task -> task.converge(context, finalContainer.get().id())); startServicesIfNeeded(context); resumeNodeIfNeeded(context); if (healthChecker.isPresent()) { @@ -559,6 +561,11 @@ public class NodeAgentImpl implements NodeAgent { } } + private void runImportantContainerMaintainers(NodeAgentContext context, Container container) { + aclMaintainer.ifPresent(maintainer -> maintainer.converge(context)); + wireguardTasks.forEach(task -> task.converge(context, container.id())); + } + private static void logChangesToNodeSpec(NodeAgentContext context, NodeSpec lastNode, NodeSpec node) { StringBuilder builder = new StringBuilder(); appendIfDifferent(builder, "state", lastNode, node, NodeSpec::state); @@ -600,23 +607,8 @@ public class NodeAgentImpl implements NodeAgent { if (context.node().state() != NodeState.active) return; context.log(logger, "Ask Orchestrator for permission to suspend node"); - try { - orchestrator.suspend(context.hostname().value()); - suspendedInOrchestrator = true; - } catch (OrchestratorException e) { - // Ensure the ACLs are up to date: The reason we're unable to suspend may be because some other - // node is unable to resume because the ACL rules of SOME Docker container is wrong... - // Same can happen with stale WireGuard config, so update that too - try { - aclMaintainer.ifPresent(maintainer -> maintainer.converge(context)); - wireguardTasks.forEach(task -> getContainer(context).ifPresent(c -> task.converge(context, c.id()))); - } catch (RuntimeException suppressed) { - logger.log(Level.WARNING, "Suppressing ACL update failure: " + suppressed); - e.addSuppressed(suppressed); - } - - throw e; - } + orchestrator.suspend(context.hostname().value()); + suspendedInOrchestrator = true; } protected void writeContainerData(NodeAgentContext context, ContainerData containerData) { } |