diff options
author | Valerij Fredriksen <valerijf@oath.com> | 2018-10-17 15:45:27 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerijf@oath.com> | 2018-10-17 15:50:42 +0200 |
commit | c1b81088c4b445623f1fb4a64641b565c66c8275 (patch) | |
tree | 2a9a3dd8bb02ffd0dc6d0ac37f22029fda0ed049 /node-admin | |
parent | 0a00751e3a46ce4507e6f417744ffc7adbae8bc2 (diff) |
Remove ClassLocking from NASU
Diffstat (limited to 'node-admin')
2 files changed, 37 insertions, 78 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java index d88174b0370..296745c8e37 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java @@ -1,16 +1,11 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeadmin; -import com.yahoo.concurrent.ThreadFactoryFactory; -import com.yahoo.concurrent.classlock.ClassLock; -import com.yahoo.concurrent.classlock.ClassLocking; -import com.yahoo.concurrent.classlock.LockInterruptException; +import com.yahoo.config.provision.HostName; import com.yahoo.log.LogLevel; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; -import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeAttributes; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater; import com.yahoo.vespa.hosted.node.admin.configserver.HttpException; @@ -25,7 +20,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; @@ -51,52 +45,33 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { private RuntimeException lastConvergenceException; private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName()); - private final ScheduledExecutorService specVerifierScheduler = - Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("specverifier")); private final Thread loopThread; private final NodeRepository nodeRepository; private final Orchestrator orchestrator; private final NodeAdmin nodeAdmin; private final Clock clock; - private final String dockerHostHostName; + private final String hostHostname; private final Duration nodeAdminConvergeStateInterval; - private final Optional<ClassLocking> classLocking; - private Optional<ClassLock> classLock = Optional.empty(); private Instant lastTick; public NodeAdminStateUpdaterImpl( NodeRepository nodeRepository, Orchestrator orchestrator, - StorageMaintainer storageMaintainer, NodeAdmin nodeAdmin, - String dockerHostHostName, + HostName hostHostname, Clock clock, - Duration nodeAdminConvergeStateInterval, - Optional<ClassLocking> classLocking) { - log.info(objectToString() + ": Creating object"); + Duration nodeAdminConvergeStateInterval) { this.nodeRepository = nodeRepository; this.orchestrator = orchestrator; this.nodeAdmin = nodeAdmin; - this.dockerHostHostName = dockerHostHostName; + this.hostHostname = hostHostname.value(); this.clock = clock; this.nodeAdminConvergeStateInterval = nodeAdminConvergeStateInterval; - this.classLocking = classLocking; this.lastTick = clock.instant(); this.loopThread = new Thread(() -> { - if (classLocking.isPresent()) { - log.info(objectToString() + ": Acquiring lock"); - try { - classLock = Optional.of(classLocking.get().lockWhile(NodeAdminStateUpdater.class, () -> !terminated.get())); - } catch (LockInterruptException e) { - classLock = Optional.empty(); - return; - } - } - - log.info(objectToString() + ": Starting threads and schedulers"); nodeAdmin.start(); while (! terminated.get()) { @@ -106,15 +81,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { this.loopThread.setName("tick-NodeAdminStateUpdater"); } - private String objectToString() { - return this.getClass().getSimpleName() + "@" + Integer.toString(System.identityHashCode(this)); - } - @Override public Map<String, Object> getDebugPage() { Map<String, Object> debug = new LinkedHashMap<>(); synchronized (monitor) { - debug.put("dockerHostHostName", dockerHostHostName); + debug.put("hostHostname", hostHostname); debug.put("wantedState", wantedState); debug.put("currentState", currentState); debug.put("NodeAdmin", nodeAdmin.debugInfo()); @@ -215,13 +186,13 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); } - boolean hostIsActiveInNR = nodeRepository.getNode(dockerHostHostName).getState() == Node.State.active; + boolean hostIsActiveInNR = nodeRepository.getNode(hostHostname).getState() == Node.State.active; switch (wantedState) { case RESUMED: - if (hostIsActiveInNR) orchestrator.resume(dockerHostHostName); + if (hostIsActiveInNR) orchestrator.resume(hostHostname); break; case SUSPENDED_NODE_ADMIN: - if (hostIsActiveInNR) orchestrator.suspend(dockerHostHostName); + if (hostIsActiveInNR) orchestrator.suspend(hostHostname); break; case SUSPENDED: // Fetch active nodes from node repo before suspending nodes. @@ -233,9 +204,9 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { List<String> nodesInActiveState = getNodesInActiveState(); List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState); - if (hostIsActiveInNR) nodesToSuspend.add(dockerHostHostName); + if (hostIsActiveInNR) nodesToSuspend.add(hostHostname); if (!nodesToSuspend.isEmpty()) { - orchestrator.suspend(dockerHostHostName, nodesToSuspend); + orchestrator.suspend(hostHostname, nodesToSuspend); log.info("Orchestrator allows suspension of " + nodesToSuspend); } @@ -257,7 +228,7 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { private void fetchContainersToRunFromNodeRepository() { try { - final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName); + final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname); nodeAdmin.refreshContainersToRun(containersToRun); } catch (Exception e) { log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); @@ -265,7 +236,7 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { } private List<String> getNodesInActiveState() { - return nodeRepository.getNodes(dockerHostHostName) + return nodeRepository.getNodes(hostHostname) .stream() .filter(node -> node.getState() == Node.State.active) .map(NodeSpec::getHostname) @@ -277,13 +248,10 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { } public void stop() { - log.info(objectToString() + ": Stop called"); if (!terminated.compareAndSet(false, true)) { throw new RuntimeException("Can not re-stop a node agent."); } - classLocking.ifPresent(ClassLocking::interrupt); - // First we need to stop NodeAdminStateUpdaterImpl thread to make sure no new NodeAgents are spawned signalWorkToBeDone(); @@ -297,11 +265,5 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { // Finally, stop NodeAdmin and all the NodeAgents nodeAdmin.stop(); - - classLock.ifPresent(lock -> { - log.info(objectToString() + ": Releasing lock"); - lock.close(); - }); - log.info(objectToString() + ": Stop complete"); } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java index fd54773910d..82f2408b252 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java @@ -1,10 +1,10 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeadmin; +import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.NodeType; import com.yahoo.test.ManualClock; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; -import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; @@ -15,7 +15,6 @@ import org.junit.Test; import java.time.Duration; import java.util.ArrayList; import java.util.List; -import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -44,33 +43,31 @@ import static org.mockito.Mockito.when; public class NodeAdminStateUpdaterImplTest { private final NodeRepository nodeRepository = mock(NodeRepository.class); private final Orchestrator orchestrator = mock(Orchestrator.class); - private final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class); private final NodeAdmin nodeAdmin = mock(NodeAdmin.class); - private final String parentHostname = "basehost1.test.yahoo.com"; + private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com"); private final ManualClock clock = new ManualClock(); private final Duration convergeStateInterval = Duration.ofSeconds(30); private final NodeAdminStateUpdaterImpl refresher = spy(new NodeAdminStateUpdaterImpl( - nodeRepository, orchestrator, storageMaintainer, nodeAdmin, parentHostname, clock, - convergeStateInterval, Optional.empty())); + nodeRepository, orchestrator, nodeAdmin, hostHostname, clock, convergeStateInterval)); @Test public void testStateConvergence() { mockNodeRepo(Node.State.active, 4); - List<String> activeHostnames = nodeRepository.getNodes(parentHostname).stream() + List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream() .map(NodeSpec::getHostname) .collect(Collectors.toList()); List<String> suspendHostnames = new ArrayList<>(activeHostnames); - suspendHostnames.add(parentHostname); + suspendHostnames.add(hostHostname.value()); // Initially everything is frozen to force convergence assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); - doNothing().when(orchestrator).resume(parentHostname); + doNothing().when(orchestrator).resume(hostHostname.value()); tickAfter(0); // The first tick should unfreeze - verify(orchestrator, times(1)).resume(parentHostname); // Resume host - verify(orchestrator, times(1)).resume(parentHostname); + verify(orchestrator, times(1)).resume(hostHostname.value()); // Resume host + verify(orchestrator, times(1)).resume(hostHostname.value()); // Everything is running and we want to continue running, therefore we have converged refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); @@ -78,7 +75,7 @@ public class NodeAdminStateUpdaterImplTest { tickAfter(35); refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); verify(refresher, never()).signalWorkToBeDone(); // No attempt in changing state - verify(orchestrator, times(1)).resume(parentHostname); // Already resumed + verify(orchestrator, times(1)).resume(hostHostname.value()); // Already resumed // Lets try to suspend node admin only, immediately we get false back, and need to wait until next // tick before any change can happen @@ -100,7 +97,7 @@ public class NodeAdminStateUpdaterImplTest { when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); doThrow(new RuntimeException(exceptionMessage)) - .when(orchestrator).suspend(eq(parentHostname)); + .when(orchestrator).suspend(eq(hostHostname.value())); tickAfter(35); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage); verify(refresher, times(1)).signalWorkToBeDone(); @@ -111,7 +108,7 @@ public class NodeAdminStateUpdaterImplTest { when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdaterImpl.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1)); doThrow(new RuntimeException(exceptionMessage)).doNothing() - .when(orchestrator).suspend(eq(parentHostname)); + .when(orchestrator).suspend(eq(hostHostname.value())); tickAfter(35); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage); verify(refresher, times(1)).signalWorkToBeDone(); @@ -122,12 +119,12 @@ public class NodeAdminStateUpdaterImplTest { verify(nodeAdmin, times(2)).setFrozen(eq(false)); // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services - verify(orchestrator, times(0)).suspend(eq(parentHostname), eq(suspendHostnames)); + verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames)); doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE); tickAfter(0); // Change in wanted state, no need to wait - verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(suspendHostnames)); + verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames)); verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick verify(nodeAdmin, times(2)).setFrozen(eq(false)); @@ -150,8 +147,8 @@ public class NodeAdminStateUpdaterImplTest { tickAfter(35); assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "NodeAdmin is not yet unfrozen"); - doThrow(new OrchestratorException("Cannot allow to suspend " + parentHostname)).doNothing() - .when(orchestrator).resume(parentHostname); + doThrow(new OrchestratorException("Cannot allow to suspend " + hostHostname.value())).doNothing() + .when(orchestrator).resume(hostHostname.value()); tickAfter(35); assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "Cannot allow to suspend basehost1.test.yahoo.com"); tickAfter(35); @@ -165,7 +162,7 @@ public class NodeAdminStateUpdaterImplTest { // Initially everything is frozen to force convergence when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); - doNothing().when(orchestrator).resume(parentHostname); + doNothing().when(orchestrator).resume(hostHostname.value()); tickAfter(0); // The first tick should unfreeze refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); verify(nodeAdmin, times(1)).setFrozen(eq(false)); @@ -173,7 +170,7 @@ public class NodeAdminStateUpdaterImplTest { // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(parentHostname)); + doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value())); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); tickAfter(30); @@ -207,21 +204,21 @@ public class NodeAdminStateUpdaterImplTest { // orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze. tickAfter(0); refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(orchestrator, never()).resume(eq(parentHostname)); + verify(orchestrator, never()).resume(eq(hostHostname.value())); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); tickAfter(0); refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN); - verify(orchestrator, never()).suspend(eq(parentHostname)); + verify(orchestrator, never()).suspend(eq(hostHostname.value())); // When doing batch suspend, only suspend the containers if the host is not active - List<String> activeHostnames = nodeRepository.getNodes(parentHostname).stream() + List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream() .map(NodeSpec::getHostname) .collect(Collectors.toList()); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE); tickAfter(0); refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); - verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(activeHostnames)); + verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames)); } private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) { @@ -246,10 +243,10 @@ public class NodeAdminStateUpdaterImplTest { .build()) .collect(Collectors.toList()); - when(nodeRepository.getNodes(eq(parentHostname))).thenReturn(containersToRun); + when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun); - when(nodeRepository.getNode(eq(parentHostname))).thenReturn(new NodeSpec.Builder() - .hostname(parentHostname) + when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(new NodeSpec.Builder() + .hostname(hostHostname.value()) .state(hostState) .nodeType(NodeType.tenant) .flavor("default") |