aboutsummaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorhakonhall <hakon@yahoo-inc.com>2017-06-27 13:39:07 +0200
committerGitHub <noreply@github.com>2017-06-27 13:39:07 +0200
commit2308a73bce15a98c8d748cccca55904f80ff7180 (patch)
tree146f6caf0af81a94ee55219946a946efdb7c87c3 /node-admin
parenta94927360efdeb67e1bfd8137d5a6dd921a10a18 (diff)
parent696b53aec5f0719dfeb7036870674351ea8f5c70 (diff)
Merge pull request #2887 from yahoo/hakon/resume-before-updating-node-repo
Resume before updating node repo
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java18
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java25
2 files changed, 27 insertions, 16 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 8169b9daee8..de8f2f183ca 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -467,19 +467,13 @@ public class NodeAgentImpl implements NodeAgent {
}
runLocalResumeScriptIfNeeded(nodeSpec);
- // Because it's more important to stop a bad release from rolling out in prod,
- // we put the resume call last. So if we fail after updating the node repo attributes
- // but before resume, the app may go through the tenant pipeline but will halt in prod.
- //
- // Note that this problem exists only because there are 2 different mechanisms
- // that should really be parts of a single mechanism:
- // - The content of node repo is used to determine whether a new Vespa+application
- // has been successfully rolled out.
- // - Slobrok and internal orchestrator state is used to determine whether
- // to allow upgrade (suspend).
- updateNodeRepoWithCurrentAttributes(nodeSpec);
- logger.info("Call resume against Orchestrator");
+
orchestrator.resume(hostname);
+
+ // Updating the node repo is best-effort: Since resume has succeeded when getting here,
+ // suspension of the next node wil be granted. However, the rollout of a new release IS
+ // gated on this succeeding.
+ updateNodeRepoWithCurrentAttributes(nodeSpec);
break;
case inactive:
storageMaintainer.ifPresent(maintainer -> maintainer.removeOldFilesFromNode(containerName));
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 0264e2ac102..0f5721a5d75 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -117,9 +117,8 @@ public class NodeAgentImplTest {
verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any());
final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository);
- // TODO: Verify this isn't run unless 1st time
- inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName));
- // TODO: This should not happen when nothing is changed. Now it happens 1st time through.
+ inOrder.verify(dockerOperations).resumeNode(eq(containerName));
+ inOrder.verify(orchestrator).resume(hostName);
inOrder.verify(nodeRepository).updateNodeAttributes(
hostName,
new NodeAttributes()
@@ -127,7 +126,24 @@ public class NodeAgentImplTest {
.withRebootGeneration(rebootGeneration)
.withDockerImage(dockerImage)
.withVespaVersion(vespaVersion));
+
+ nodeAgent.converge();
+
+ verify(dockerOperations, never()).removeContainer(any());
+ verify(orchestrator, never()).suspend(any(String.class));
+ verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any());
+
+ // Should not be called 2nd time
+ inOrder.verify(dockerOperations, never()).resumeNode(eq(containerName));
inOrder.verify(orchestrator).resume(hostName);
+ // Should not be called 2nd time
+ inOrder.verify(nodeRepository, never()).updateNodeAttributes(
+ hostName,
+ new NodeAttributes()
+ .withRestartGeneration(restartGeneration)
+ .withRebootGeneration(rebootGeneration)
+ .withDockerImage(dockerImage)
+ .withVespaVersion(vespaVersion));
}
@Test
@@ -158,13 +174,13 @@ public class NodeAgentImplTest {
inOrder.verify(aclMaintainer, times(1)).run();
inOrder.verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec));
inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName));
+ inOrder.verify(orchestrator).resume(hostName);
inOrder.verify(nodeRepository).updateNodeAttributes(
hostName, new NodeAttributes()
.withRestartGeneration(restartGeneration)
.withRebootGeneration(rebootGeneration)
.withDockerImage(dockerImage)
.withVespaVersion(vespaVersion));
- inOrder.verify(orchestrator).resume(hostName);
}
@Test
@@ -463,6 +479,7 @@ public class NodeAgentImplTest {
inOrder.verify(dockerOperations).resumeNode(any());
inOrder.verify(orchestrator).resume(hostName);
+ inOrder.verify(nodeRepository).updateNodeAttributes(eq(hostName), any());
inOrder.verifyNoMoreInteractions();
}