aboutsummaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorhakonhall <hakon@yahoo-inc.com>2017-06-28 00:06:08 +0200
committerGitHub <noreply@github.com>2017-06-28 00:06:08 +0200
commit98ef3fbb6d57fd4a12e169c258e801d8f5f6c3e8 (patch)
tree7e4d65cbd872983a7ce51154963f79a1c796e37c /node-admin
parent1429f2d600dc9f68f521769fe0a20ab24cee23d2 (diff)
Revert "Resume before updating node repo"
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java18
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java25
2 files changed, 16 insertions, 27 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index de8f2f183ca..8169b9daee8 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -467,13 +467,19 @@ public class NodeAgentImpl implements NodeAgent {
}
runLocalResumeScriptIfNeeded(nodeSpec);
-
- orchestrator.resume(hostname);
-
- // Updating the node repo is best-effort: Since resume has succeeded when getting here,
- // suspension of the next node wil be granted. However, the rollout of a new release IS
- // gated on this succeeding.
+ // Because it's more important to stop a bad release from rolling out in prod,
+ // we put the resume call last. So if we fail after updating the node repo attributes
+ // but before resume, the app may go through the tenant pipeline but will halt in prod.
+ //
+ // Note that this problem exists only because there are 2 different mechanisms
+ // that should really be parts of a single mechanism:
+ // - The content of node repo is used to determine whether a new Vespa+application
+ // has been successfully rolled out.
+ // - Slobrok and internal orchestrator state is used to determine whether
+ // to allow upgrade (suspend).
updateNodeRepoWithCurrentAttributes(nodeSpec);
+ logger.info("Call resume against Orchestrator");
+ orchestrator.resume(hostname);
break;
case inactive:
storageMaintainer.ifPresent(maintainer -> maintainer.removeOldFilesFromNode(containerName));
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 0f5721a5d75..0264e2ac102 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -117,8 +117,9 @@ public class NodeAgentImplTest {
verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any());
final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository);
- inOrder.verify(dockerOperations).resumeNode(eq(containerName));
- inOrder.verify(orchestrator).resume(hostName);
+ // TODO: Verify this isn't run unless 1st time
+ inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName));
+ // TODO: This should not happen when nothing is changed. Now it happens 1st time through.
inOrder.verify(nodeRepository).updateNodeAttributes(
hostName,
new NodeAttributes()
@@ -126,24 +127,7 @@ public class NodeAgentImplTest {
.withRebootGeneration(rebootGeneration)
.withDockerImage(dockerImage)
.withVespaVersion(vespaVersion));
-
- nodeAgent.converge();
-
- verify(dockerOperations, never()).removeContainer(any());
- verify(orchestrator, never()).suspend(any(String.class));
- verify(dockerOperations, never()).scheduleDownloadOfImage(eq(containerName), any(), any());
-
- // Should not be called 2nd time
- inOrder.verify(dockerOperations, never()).resumeNode(eq(containerName));
inOrder.verify(orchestrator).resume(hostName);
- // Should not be called 2nd time
- inOrder.verify(nodeRepository, never()).updateNodeAttributes(
- hostName,
- new NodeAttributes()
- .withRestartGeneration(restartGeneration)
- .withRebootGeneration(rebootGeneration)
- .withDockerImage(dockerImage)
- .withVespaVersion(vespaVersion));
}
@Test
@@ -174,13 +158,13 @@ public class NodeAgentImplTest {
inOrder.verify(aclMaintainer, times(1)).run();
inOrder.verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec));
inOrder.verify(dockerOperations, times(1)).resumeNode(eq(containerName));
- inOrder.verify(orchestrator).resume(hostName);
inOrder.verify(nodeRepository).updateNodeAttributes(
hostName, new NodeAttributes()
.withRestartGeneration(restartGeneration)
.withRebootGeneration(rebootGeneration)
.withDockerImage(dockerImage)
.withVespaVersion(vespaVersion));
+ inOrder.verify(orchestrator).resume(hostName);
}
@Test
@@ -479,7 +463,6 @@ public class NodeAgentImplTest {
inOrder.verify(dockerOperations).resumeNode(any());
inOrder.verify(orchestrator).resume(hostName);
- inOrder.verify(nodeRepository).updateNodeAttributes(eq(hostName), any());
inOrder.verifyNoMoreInteractions();
}