summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java5
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java24
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java9
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java15
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java20
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java2
7 files changed, 53 insertions, 24 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java
index 4a19e5fe215..cdfefa6a184 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java
@@ -38,7 +38,7 @@ public interface DockerOperations {
/**
* Try to suspend node. Suspending a node means the node should be taken offline,
- * such that maintenance can be done of the node (upgrading, rebooting, etc),
+ * such that maintenance of the node can be done (upgrading, rebooting, etc),
* and such that we will start serving again as soon as possible afterwards.
*/
void trySuspendNode(ContainerName containerName);
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index ba8a2e55587..12c1b9bcf11 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -145,7 +145,10 @@ public class NodeAdminImpl implements NodeAdmin {
// Each container may spend 1-1:30 minutes stopping
nodeAgentsByHostname.values().parallelStream()
.filter(nodeAgent -> hostnames.contains(nodeAgent.getHostname()))
- .forEach(NodeAgent::stopServices);
+ .forEach(nodeAgent -> {
+ nodeAgent.suspend();
+ nodeAgent.stopServices();
+ });
}
public int getNumberOfNodeAgents() {
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
index 5ab73788299..90e4b2e6c8b 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
@@ -208,11 +208,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
// We have spent too much time trying to freeze and node admin is still not frozen.
// To avoid node agents stalling for too long, we'll force unfrozen ticks now.
log.info("Timed out trying to freeze, will force unfreezed ticks");
+ fetchContainersToRunFromNodeRepository();
nodeAdmin.setFrozen(false);
}
+ } else if (currentState == RESUMED) {
+ fetchContainersToRunFromNodeRepository();
}
-
- fetchContainersToRunFromNodeRepository();
}
private void setLastConvergenceException(RuntimeException exception) {
@@ -277,20 +278,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
}
private void fetchContainersToRunFromNodeRepository() {
- synchronized (monitor) {
- // Refresh containers to run even if we would like to suspend but have failed to do so yet,
- // because it may take a long time to get permission to suspend.
- if (currentState != RESUMED) {
- log.info("Frozen, skipping fetching info from node repository");
- return;
- }
-
- try {
- final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName);
- nodeAdmin.refreshContainersToRun(containersToRun);
- } catch (Exception e) {
- log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
- }
+ try {
+ final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName);
+ nodeAdmin.refreshContainersToRun(containersToRun);
+ } catch (Exception e) {
+ log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
index 92c44969d5e..9b759b208eb 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
@@ -18,9 +18,18 @@ public interface NodeAgent {
*/
boolean setFrozen(boolean frozen);
+ /**
+ * Stop services running on node. Depending on the state of the node, {@link #suspend()} might need to be
+ * called before calling this method.
+ */
void stopServices();
/**
+ * Suspend node. Take node offline (e.g. take node out of VIP, drain traffic, prepare for restart etc.)
+ */
+ void suspend();
+
+ /**
* Returns a map containing all relevant NodeAgent variables and their current values.
*/
Map<String, Object> debugInfo();
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index ad38306547d..43e3051ce84 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -302,13 +302,23 @@ public class NodeAgentImpl implements NodeAgent {
logger.info("Stopping services");
if (containerState == ABSENT) return;
try {
- dockerOperations.trySuspendNode(containerName);
dockerOperations.stopServicesOnNode(containerName);
} catch (ContainerNotFoundException e) {
containerState = ABSENT;
}
}
+ @Override
+ public void suspend() {
+ logger.info("Suspending services on node");
+ if (containerState == ABSENT) return;
+ try {
+ dockerOperations.trySuspendNode(containerName);
+ } catch (ContainerNotFoundException e) {
+ containerState = ABSENT;
+ }
+ }
+
private Optional<String> shouldRemoveContainer(NodeSpec node, Container existingContainer) {
final Node.State nodeState = node.getState();
if (nodeState == Node.State.dirty || nodeState == Node.State.provisioned) {
@@ -344,6 +354,9 @@ public class NodeAgentImpl implements NodeAgent {
}
try {
+ if (node.getState() != Node.State.dirty) {
+ suspend();
+ }
stopServices();
} catch (Exception e) {
logger.info("Failed stopping services, ignoring", e);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
index cfdb576fb34..fd54773910d 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
@@ -22,6 +22,7 @@ import java.util.stream.IntStream;
import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
+import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doNothing;
@@ -172,19 +173,28 @@ public class NodeAdminStateUpdaterImplTest {
// Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- doThrow(new RuntimeException(exceptionMsg))
- .when(orchestrator).suspend(eq(parentHostname));
+ doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(parentHostname));
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0);
+ tickAfter(30);
verify(nodeAdmin, times(1)).setFrozen(eq(true));
+ tickAfter(30);
+ verify(nodeAdmin, times(2)).setFrozen(eq(true));
+ verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks
+ verify(nodeAdmin, times(1)).refreshContainersToRun(any());
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg);
+ // Only resume and fetch containers when subsystem freeze duration expires
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
+ tickAfter(30);
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+ verify(nodeAdmin, times(2)).refreshContainersToRun(any());
+
// We change our mind, want to remain resumed
assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0);
+ tickAfter(30);
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze!
+ verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze!
}
@Test
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index e0031f9b9b3..28a2b8ff5e0 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -426,6 +426,8 @@ public class NodeAgentImplTest {
verify(dockerOperations, never()).createContainer(eq(containerName), any(), any());
verify(dockerOperations, never()).startContainer(eq(containerName));
+ verify(dockerOperations, never()).trySuspendNode(eq(containerName));
+ verify(dockerOperations, times(1)).stopServicesOnNode(eq(containerName));
verify(orchestrator, never()).resume(any(String.class));
verify(orchestrator, never()).suspend(any(String.class));
// current Docker image and vespa version should be cleared