diff options
author | hakonhall <hakon@yahoo-inc.com> | 2017-06-28 00:06:08 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-28 00:06:08 +0200 |
commit | 98ef3fbb6d57fd4a12e169c258e801d8f5f6c3e8 (patch) | |
tree | 7e4d65cbd872983a7ce51154963f79a1c796e37c /node-admin | |
parent | 1429f2d600dc9f68f521769fe0a20ab24cee23d2 (diff) |
Revert "Resume before updating node repo"
Diffstat (limited to 'node-admin')
2 files changed, 16 insertions, 27 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index de8f2f183ca..8169b9daee8 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -467,13 +467,19 @@ public class NodeAgentImpl implements NodeAgent { } runLocalResumeScriptIfNeeded(nodeSpec); - - orchestrator.resume(hostname); - - // Updating the node repo is best-effort: Since resume has succeeded when getting here, - // suspension of the next node wil be granted. However, the rollout of a new release IS - // gated on this succeeding. + // Because it's more important to stop a bad release from rolling out in prod, + // we put the resume call last. So if we fail after updating the node repo attributes + // but before resume, the app may go through the tenant pipeline but will halt in prod. + // + // Note that this problem exists only because there are 2 different mechanisms + // that should really be parts of a single mechanism: + // - The content of node repo is used to determine whether a new Vespa+application + // has been successfully rolled out. + // - Slobrok and internal orchestrator state is used to determine whether + // to allow upgrade (suspend). updateNodeRepoWithCurrentAttributes(nodeSpec); + logger.info("Call resume against Orchestrator"); + orchestrator.resume(hostname); break; case inactive: storageMaintainer.ifPresent(maintainer -> maintainer.removeOldFilesFromNode(containerName)); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 0f5721a5d75..0264e2ac102 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -117,8 +117,9 @@ public class NodeAgentImplTest { verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any()); final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository); - inOrder.verify(dockerOperations).resumeNode(eq(containerName)); - inOrder.verify(orchestrator).resume(hostName); + // TODO: Verify this isn't run unless 1st time + inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName)); + // TODO: This should not happen when nothing is changed. Now it happens 1st time through. inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes() @@ -126,24 +127,7 @@ public class NodeAgentImplTest { .withRebootGeneration(rebootGeneration) .withDockerImage(dockerImage) .withVespaVersion(vespaVersion)); - - nodeAgent.converge(); - - verify(dockerOperations, never()).removeContainer(any()); - verify(orchestrator, never()).suspend(any(String.class)); - verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any()); - - // Should not be called 2nd time - inOrder.verify(dockerOperations, never()).resumeNode(eq(containerName)); inOrder.verify(orchestrator).resume(hostName); - // Should not be called 2nd time - inOrder.verify(nodeRepository, never()).updateNodeAttributes( - hostName, - new NodeAttributes() - .withRestartGeneration(restartGeneration) - .withRebootGeneration(rebootGeneration) - .withDockerImage(dockerImage) - .withVespaVersion(vespaVersion)); } @Test @@ -174,13 +158,13 @@ public class NodeAgentImplTest { inOrder.verify(aclMaintainer, times(1)).run(); inOrder.verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec)); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName)); - inOrder.verify(orchestrator).resume(hostName); inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes() .withRestartGeneration(restartGeneration) .withRebootGeneration(rebootGeneration) .withDockerImage(dockerImage) .withVespaVersion(vespaVersion)); + inOrder.verify(orchestrator).resume(hostName); } @Test @@ -479,7 +463,6 @@ public class NodeAgentImplTest { inOrder.verify(dockerOperations).resumeNode(any()); inOrder.verify(orchestrator).resume(hostName); - inOrder.verify(nodeRepository).updateNodeAttributes(eq(hostName), any()); inOrder.verifyNoMoreInteractions(); } |