summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2018-09-25 14:33:35 +0200
committerGitHub <noreply@github.com>2018-09-25 14:33:35 +0200
commit7319c973253ca07ceb530e36688d8b034022cc8a (patch)
tree70654370164f0c1b323a73bda2833eec99be9a19 /node-admin
parent213afb57f3f20b9d77ae3e8c465745e91c84f744 (diff)
parent52e965c371a89c20088889cd6456a6a950f2206f (diff)
Merge pull request #7084 from vespa-engine/freva/fetch-containers-when-subsystem-freeze-expires
NodeAdmin: Fetch containers to run when subsystem freeze expires
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java24
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java20
2 files changed, 23 insertions, 21 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
index 5ab73788299..90e4b2e6c8b 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
@@ -208,11 +208,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
// We have spent too much time trying to freeze and node admin is still not frozen.
// To avoid node agents stalling for too long, we'll force unfrozen ticks now.
log.info("Timed out trying to freeze, will force unfreezed ticks");
+ fetchContainersToRunFromNodeRepository();
nodeAdmin.setFrozen(false);
}
+ } else if (currentState == RESUMED) {
+ fetchContainersToRunFromNodeRepository();
}
-
- fetchContainersToRunFromNodeRepository();
}
private void setLastConvergenceException(RuntimeException exception) {
@@ -277,20 +278,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
}
private void fetchContainersToRunFromNodeRepository() {
- synchronized (monitor) {
- // Refresh containers to run even if we would like to suspend but have failed to do so yet,
- // because it may take a long time to get permission to suspend.
- if (currentState != RESUMED) {
- log.info("Frozen, skipping fetching info from node repository");
- return;
- }
-
- try {
- final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName);
- nodeAdmin.refreshContainersToRun(containersToRun);
- } catch (Exception e) {
- log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
- }
+ try {
+ final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName);
+ nodeAdmin.refreshContainersToRun(containersToRun);
+ } catch (Exception e) {
+ log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
}
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
index cfdb576fb34..fd54773910d 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
@@ -22,6 +22,7 @@ import java.util.stream.IntStream;
import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
+import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doNothing;
@@ -172,19 +173,28 @@ public class NodeAdminStateUpdaterImplTest {
// Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- doThrow(new RuntimeException(exceptionMsg))
- .when(orchestrator).suspend(eq(parentHostname));
+ doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(parentHostname));
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0);
+ tickAfter(30);
verify(nodeAdmin, times(1)).setFrozen(eq(true));
+ tickAfter(30);
+ verify(nodeAdmin, times(2)).setFrozen(eq(true));
+ verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks
+ verify(nodeAdmin, times(1)).refreshContainersToRun(any());
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg);
+ // Only resume and fetch containers when subsystem freeze duration expires
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
+ tickAfter(30);
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+ verify(nodeAdmin, times(2)).refreshContainersToRun(any());
+
// We change our mind, want to remain resumed
assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0);
+ tickAfter(30);
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze!
+ verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze!
}
@Test