From d9fab8d7d6ca30f21e105a7aabdd054b3bd7058e Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Tue, 25 Sep 2018 13:41:12 +0200 Subject: Fetch containers to run when subsystem freeze expires --- .../admin/nodeadmin/NodeAdminStateUpdaterImpl.java | 24 ++++++++-------------- .../nodeadmin/NodeAdminStateUpdaterImplTest.java | 20 +++++++++++++----- 2 files changed, 23 insertions(+), 21 deletions(-) (limited to 'node-admin') diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java index 5ab73788299..02c144e3545 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java @@ -209,10 +209,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { // To avoid node agents stalling for too long, we'll force unfrozen ticks now. log.info("Timed out trying to freeze, will force unfreezed ticks"); nodeAdmin.setFrozen(false); + fetchContainersToRunFromNodeRepository(); } + } else if (currentState == RESUMED) { + fetchContainersToRunFromNodeRepository(); } - - fetchContainersToRunFromNodeRepository(); } private void setLastConvergenceException(RuntimeException exception) { @@ -277,20 +278,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { } private void fetchContainersToRunFromNodeRepository() { - synchronized (monitor) { - // Refresh containers to run even if we would like to suspend but have failed to do so yet, - // because it may take a long time to get permission to suspend. - if (currentState != RESUMED) { - log.info("Frozen, skipping fetching info from node repository"); - return; - } - - try { - final List containersToRun = nodeRepository.getNodes(dockerHostHostName); - nodeAdmin.refreshContainersToRun(containersToRun); - } catch (Exception e) { - log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); - } + try { + final List containersToRun = nodeRepository.getNodes(dockerHostHostName); + nodeAdmin.refreshContainersToRun(containersToRun); + } catch (Exception e) { + log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java index cfdb576fb34..fd54773910d 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java @@ -22,6 +22,7 @@ import java.util.stream.IntStream; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import static org.mockito.Matchers.any; import static org.mockito.Matchers.anyBoolean; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doNothing; @@ -172,19 +173,28 @@ public class NodeAdminStateUpdaterImplTest { // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - doThrow(new RuntimeException(exceptionMsg)) - .when(orchestrator).suspend(eq(parentHostname)); + doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(parentHostname)); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); + tickAfter(30); verify(nodeAdmin, times(1)).setFrozen(eq(true)); + tickAfter(30); + verify(nodeAdmin, times(2)).setFrozen(eq(true)); + verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks + verify(nodeAdmin, times(1)).refreshContainersToRun(any()); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg); + // Only resume and fetch containers when subsystem freeze duration expires + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1)); + tickAfter(30); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + verify(nodeAdmin, times(2)).refreshContainersToRun(any()); + // We change our mind, want to remain resumed assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); + tickAfter(30); refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze! + verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze! } @Test -- cgit v1.2.3 From 52e965c371a89c20088889cd6456a6a950f2206f Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Tue, 25 Sep 2018 14:30:52 +0200 Subject: Reverse order --- .../vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'node-admin') diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java index 02c144e3545..90e4b2e6c8b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java @@ -208,8 +208,8 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { // We have spent too much time trying to freeze and node admin is still not frozen. // To avoid node agents stalling for too long, we'll force unfrozen ticks now. log.info("Timed out trying to freeze, will force unfreezed ticks"); - nodeAdmin.setFrozen(false); fetchContainersToRunFromNodeRepository(); + nodeAdmin.setFrozen(false); } } else if (currentState == RESUMED) { fetchContainersToRunFromNodeRepository(); -- cgit v1.2.3