diff options
Diffstat (limited to 'node-admin')
2 files changed, 27 insertions, 16 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 8169b9daee8..de8f2f183ca 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -467,19 +467,13 @@ public class NodeAgentImpl implements NodeAgent { } runLocalResumeScriptIfNeeded(nodeSpec); - // Because it's more important to stop a bad release from rolling out in prod, - // we put the resume call last. So if we fail after updating the node repo attributes - // but before resume, the app may go through the tenant pipeline but will halt in prod. - // - // Note that this problem exists only because there are 2 different mechanisms - // that should really be parts of a single mechanism: - // - The content of node repo is used to determine whether a new Vespa+application - // has been successfully rolled out. - // - Slobrok and internal orchestrator state is used to determine whether - // to allow upgrade (suspend). - updateNodeRepoWithCurrentAttributes(nodeSpec); - logger.info("Call resume against Orchestrator"); + orchestrator.resume(hostname); + + // Updating the node repo is best-effort: Since resume has succeeded when getting here, + // suspension of the next node wil be granted. However, the rollout of a new release IS + // gated on this succeeding. + updateNodeRepoWithCurrentAttributes(nodeSpec); break; case inactive: storageMaintainer.ifPresent(maintainer -> maintainer.removeOldFilesFromNode(containerName)); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 0264e2ac102..0f5721a5d75 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -117,9 +117,8 @@ public class NodeAgentImplTest { verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any()); final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository); - // TODO: Verify this isn't run unless 1st time - inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName)); - // TODO: This should not happen when nothing is changed. Now it happens 1st time through. + inOrder.verify(dockerOperations).resumeNode(eq(containerName)); + inOrder.verify(orchestrator).resume(hostName); inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes() @@ -127,7 +126,24 @@ public class NodeAgentImplTest { .withRebootGeneration(rebootGeneration) .withDockerImage(dockerImage) .withVespaVersion(vespaVersion)); + + nodeAgent.converge(); + + verify(dockerOperations, never()).removeContainer(any()); + verify(orchestrator, never()).suspend(any(String.class)); + verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any()); + + // Should not be called 2nd time + inOrder.verify(dockerOperations, never()).resumeNode(eq(containerName)); inOrder.verify(orchestrator).resume(hostName); + // Should not be called 2nd time + inOrder.verify(nodeRepository, never()).updateNodeAttributes( + hostName, + new NodeAttributes() + .withRestartGeneration(restartGeneration) + .withRebootGeneration(rebootGeneration) + .withDockerImage(dockerImage) + .withVespaVersion(vespaVersion)); } @Test @@ -158,13 +174,13 @@ public class NodeAgentImplTest { inOrder.verify(aclMaintainer, times(1)).run(); inOrder.verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec)); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName)); + inOrder.verify(orchestrator).resume(hostName); inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes() .withRestartGeneration(restartGeneration) .withRebootGeneration(rebootGeneration) .withDockerImage(dockerImage) .withVespaVersion(vespaVersion)); - inOrder.verify(orchestrator).resume(hostName); } @Test @@ -463,6 +479,7 @@ public class NodeAgentImplTest { inOrder.verify(dockerOperations).resumeNode(any()); inOrder.verify(orchestrator).resume(hostName); + inOrder.verify(nodeRepository).updateNodeAttributes(eq(hostName), any()); inOrder.verifyNoMoreInteractions(); } |