author     Valerij Fredriksen <valerijf@oath.com>  2018-10-17 15:45:27 +0200
committer  Valerij Fredriksen <valerijf@oath.com>  2018-10-17 15:50:42 +0200
commit     c1b81088c4b445623f1fb4a64641b565c66c8275 (patch)
tree       2a9a3dd8bb02ffd0dc6d0ac37f22029fda0ed049 /node-admin
parent     0a00751e3a46ce4507e6f417744ffc7adbae8bc2 (diff)
Remove ClassLocking from NASU
Diffstat (limited to 'node-admin')
-rw-r--r--  node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java     | 64
-rw-r--r--  node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java | 51
2 files changed, 37 insertions(+), 78 deletions(-)
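
The heart of the change is the slimmed-down NodeAdminStateUpdaterImpl constructor: the StorageMaintainer and Optional<ClassLocking> parameters are dropped, and the host's hostname is now passed as a typed HostName instead of a raw String (unwrapped internally with .value()). A minimal wiring sketch under the new signature, with illustrative collaborator instances and a hypothetical hostname:

    import java.time.Clock;
    import java.time.Duration;
    import com.yahoo.config.provision.HostName;

    // Hypothetical wiring sketch; collaborator construction elided.
    // After this commit the constructor takes neither StorageMaintainer
    // nor Optional<ClassLocking>, and the hostname is a typed HostName.
    NodeAdminStateUpdaterImpl updater = new NodeAdminStateUpdaterImpl(
            nodeRepository,                      // NodeRepository (config server client)
            orchestrator,                        // Orchestrator client
            nodeAdmin,                           // NodeAdmin managing the NodeAgents
            HostName.from("host1.example.com"),  // hypothetical hostname
            Clock.systemUTC(),
            Duration.ofSeconds(30));             // converge-state interval

Taking a typed HostName moves hostname validation to the call site where the value is created, rather than accepting an arbitrary string.
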
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
index d88174b0370..296745c8e37 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
@@ -1,16 +1,11 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.nodeadmin;
-import com.yahoo.concurrent.ThreadFactoryFactory;
-import com.yahoo.concurrent.classlock.ClassLock;
-import com.yahoo.concurrent.classlock.ClassLocking;
-import com.yahoo.concurrent.classlock.LockInterruptException;
+import com.yahoo.config.provision.HostName;
import com.yahoo.log.LogLevel;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
-import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer;
-import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeAttributes;
import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException;
import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater;
import com.yahoo.vespa.hosted.node.admin.configserver.HttpException;
@@ -25,7 +20,6 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -51,52 +45,33 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
private RuntimeException lastConvergenceException;
private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName());
- private final ScheduledExecutorService specVerifierScheduler =
- Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("specverifier"));
private final Thread loopThread;
private final NodeRepository nodeRepository;
private final Orchestrator orchestrator;
private final NodeAdmin nodeAdmin;
private final Clock clock;
- private final String dockerHostHostName;
+ private final String hostHostname;
private final Duration nodeAdminConvergeStateInterval;
- private final Optional<ClassLocking> classLocking;
- private Optional<ClassLock> classLock = Optional.empty();
private Instant lastTick;
public NodeAdminStateUpdaterImpl(
NodeRepository nodeRepository,
Orchestrator orchestrator,
- StorageMaintainer storageMaintainer,
NodeAdmin nodeAdmin,
- String dockerHostHostName,
+ HostName hostHostname,
Clock clock,
- Duration nodeAdminConvergeStateInterval,
- Optional<ClassLocking> classLocking) {
- log.info(objectToString() + ": Creating object");
+ Duration nodeAdminConvergeStateInterval) {
this.nodeRepository = nodeRepository;
this.orchestrator = orchestrator;
this.nodeAdmin = nodeAdmin;
- this.dockerHostHostName = dockerHostHostName;
+ this.hostHostname = hostHostname.value();
this.clock = clock;
this.nodeAdminConvergeStateInterval = nodeAdminConvergeStateInterval;
- this.classLocking = classLocking;
this.lastTick = clock.instant();
this.loopThread = new Thread(() -> {
- if (classLocking.isPresent()) {
- log.info(objectToString() + ": Acquiring lock");
- try {
- classLock = Optional.of(classLocking.get().lockWhile(NodeAdminStateUpdater.class, () -> !terminated.get()));
- } catch (LockInterruptException e) {
- classLock = Optional.empty();
- return;
- }
- }
-
- log.info(objectToString() + ": Starting threads and schedulers");
nodeAdmin.start();
while (! terminated.get()) {
@@ -106,15 +81,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
this.loopThread.setName("tick-NodeAdminStateUpdater");
}
- private String objectToString() {
- return this.getClass().getSimpleName() + "@" + Integer.toString(System.identityHashCode(this));
- }
-
@Override
public Map<String, Object> getDebugPage() {
Map<String, Object> debug = new LinkedHashMap<>();
synchronized (monitor) {
- debug.put("dockerHostHostName", dockerHostHostName);
+ debug.put("hostHostname", hostHostname);
debug.put("wantedState", wantedState);
debug.put("currentState", currentState);
debug.put("NodeAdmin", nodeAdmin.debugInfo());
@@ -215,13 +186,13 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
}
- boolean hostIsActiveInNR = nodeRepository.getNode(dockerHostHostName).getState() == Node.State.active;
+ boolean hostIsActiveInNR = nodeRepository.getNode(hostHostname).getState() == Node.State.active;
switch (wantedState) {
case RESUMED:
- if (hostIsActiveInNR) orchestrator.resume(dockerHostHostName);
+ if (hostIsActiveInNR) orchestrator.resume(hostHostname);
break;
case SUSPENDED_NODE_ADMIN:
- if (hostIsActiveInNR) orchestrator.suspend(dockerHostHostName);
+ if (hostIsActiveInNR) orchestrator.suspend(hostHostname);
break;
case SUSPENDED:
// Fetch active nodes from node repo before suspending nodes.
@@ -233,9 +204,9 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
List<String> nodesInActiveState = getNodesInActiveState();
List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState);
- if (hostIsActiveInNR) nodesToSuspend.add(dockerHostHostName);
+ if (hostIsActiveInNR) nodesToSuspend.add(hostHostname);
if (!nodesToSuspend.isEmpty()) {
- orchestrator.suspend(dockerHostHostName, nodesToSuspend);
+ orchestrator.suspend(hostHostname, nodesToSuspend);
log.info("Orchestrator allows suspension of " + nodesToSuspend);
}
@@ -257,7 +228,7 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
private void fetchContainersToRunFromNodeRepository() {
try {
- final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName);
+ final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname);
nodeAdmin.refreshContainersToRun(containersToRun);
} catch (Exception e) {
log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
@@ -265,7 +236,7 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
}
private List<String> getNodesInActiveState() {
- return nodeRepository.getNodes(dockerHostHostName)
+ return nodeRepository.getNodes(hostHostname)
.stream()
.filter(node -> node.getState() == Node.State.active)
.map(NodeSpec::getHostname)
@@ -277,13 +248,10 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
}
public void stop() {
- log.info(objectToString() + ": Stop called");
if (!terminated.compareAndSet(false, true)) {
throw new RuntimeException("Can not re-stop a node agent.");
}
- classLocking.ifPresent(ClassLocking::interrupt);
-
// First we need to stop NodeAdminStateUpdaterImpl thread to make sure no new NodeAgents are spawned
signalWorkToBeDone();
@@ -297,11 +265,5 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
// Finally, stop NodeAdmin and all the NodeAgents
nodeAdmin.stop();
-
- classLock.ifPresent(lock -> {
- log.info(objectToString() + ": Releasing lock");
- lock.close();
- });
- log.info(objectToString() + ": Stop complete");
}
}
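
With ClassLocking gone, the loop thread no longer blocks on acquiring a class-wide lock before starting NodeAdmin, and stop() no longer needs to interrupt the lock wait and release the lock afterwards. Reconstructed from the context lines above, the post-change loop thread reduces to roughly this shape (the per-iteration body is outside the shown hunks and is represented here by a hypothetical tick()):

    // Approximate post-change shape of the converge loop, reconstructed
    // from the context lines in the hunks above.
    this.loopThread = new Thread(() -> {
        nodeAdmin.start();            // previously gated on ClassLock acquisition
        while (!terminated.get()) {
            tick();                   // hypothetical name for the elided loop body
        }
    });
    this.loopThread.setName("tick-NodeAdminStateUpdater");
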
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
index fd54773910d..82f2408b252 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
@@ -1,10 +1,10 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.nodeadmin;
+import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.NodeType;
import com.yahoo.test.ManualClock;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
-import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException;
@@ -15,7 +15,6 @@ import org.junit.Test;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
-import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -44,33 +43,31 @@ import static org.mockito.Mockito.when;
public class NodeAdminStateUpdaterImplTest {
private final NodeRepository nodeRepository = mock(NodeRepository.class);
private final Orchestrator orchestrator = mock(Orchestrator.class);
- private final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class);
private final NodeAdmin nodeAdmin = mock(NodeAdmin.class);
- private final String parentHostname = "basehost1.test.yahoo.com";
+ private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com");
private final ManualClock clock = new ManualClock();
private final Duration convergeStateInterval = Duration.ofSeconds(30);
private final NodeAdminStateUpdaterImpl refresher = spy(new NodeAdminStateUpdaterImpl(
- nodeRepository, orchestrator, storageMaintainer, nodeAdmin, parentHostname, clock,
- convergeStateInterval, Optional.empty()));
+ nodeRepository, orchestrator, nodeAdmin, hostHostname, clock, convergeStateInterval));
@Test
public void testStateConvergence() {
mockNodeRepo(Node.State.active, 4);
- List<String> activeHostnames = nodeRepository.getNodes(parentHostname).stream()
+ List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
.map(NodeSpec::getHostname)
.collect(Collectors.toList());
List<String> suspendHostnames = new ArrayList<>(activeHostnames);
- suspendHostnames.add(parentHostname);
+ suspendHostnames.add(hostHostname.value());
// Initially everything is frozen to force convergence
assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
- doNothing().when(orchestrator).resume(parentHostname);
+ doNothing().when(orchestrator).resume(hostHostname.value());
tickAfter(0); // The first tick should unfreeze
- verify(orchestrator, times(1)).resume(parentHostname); // Resume host
- verify(orchestrator, times(1)).resume(parentHostname);
+ verify(orchestrator, times(1)).resume(hostHostname.value()); // Resume host
+ verify(orchestrator, times(1)).resume(hostHostname.value());
// Everything is running and we want to continue running, therefore we have converged
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
@@ -78,7 +75,7 @@ public class NodeAdminStateUpdaterImplTest {
tickAfter(35);
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
verify(refresher, never()).signalWorkToBeDone(); // No attempt in changing state
- verify(orchestrator, times(1)).resume(parentHostname); // Already resumed
+ verify(orchestrator, times(1)).resume(hostHostname.value()); // Already resumed
// Lets try to suspend node admin only, immediately we get false back, and need to wait until next
// tick before any change can happen
@@ -100,7 +97,7 @@ public class NodeAdminStateUpdaterImplTest {
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
doThrow(new RuntimeException(exceptionMessage))
- .when(orchestrator).suspend(eq(parentHostname));
+ .when(orchestrator).suspend(eq(hostHostname.value()));
tickAfter(35);
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage);
verify(refresher, times(1)).signalWorkToBeDone();
@@ -111,7 +108,7 @@ public class NodeAdminStateUpdaterImplTest {
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdaterImpl.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1));
doThrow(new RuntimeException(exceptionMessage)).doNothing()
- .when(orchestrator).suspend(eq(parentHostname));
+ .when(orchestrator).suspend(eq(hostHostname.value()));
tickAfter(35);
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage);
verify(refresher, times(1)).signalWorkToBeDone();
@@ -122,12 +119,12 @@ public class NodeAdminStateUpdaterImplTest {
verify(nodeAdmin, times(2)).setFrozen(eq(false));
// At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services
- verify(orchestrator, times(0)).suspend(eq(parentHostname), eq(suspendHostnames));
+ verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(0); // Change in wanted state, no need to wait
- verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(suspendHostnames));
+ verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
// Make sure we dont roll back if we fail to stop services - we will try to stop again next tick
verify(nodeAdmin, times(2)).setFrozen(eq(false));
@@ -150,8 +147,8 @@ public class NodeAdminStateUpdaterImplTest {
tickAfter(35);
assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "NodeAdmin is not yet unfrozen");
- doThrow(new OrchestratorException("Cannot allow to suspend " + parentHostname)).doNothing()
- .when(orchestrator).resume(parentHostname);
+ doThrow(new OrchestratorException("Cannot allow to suspend " + hostHostname.value())).doNothing()
+ .when(orchestrator).resume(hostHostname.value());
tickAfter(35);
assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "Cannot allow to suspend basehost1.test.yahoo.com");
tickAfter(35);
@@ -165,7 +162,7 @@ public class NodeAdminStateUpdaterImplTest {
// Initially everything is frozen to force convergence
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
- doNothing().when(orchestrator).resume(parentHostname);
+ doNothing().when(orchestrator).resume(hostHostname.value());
tickAfter(0); // The first tick should unfreeze
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
verify(nodeAdmin, times(1)).setFrozen(eq(false));
@@ -173,7 +170,7 @@ public class NodeAdminStateUpdaterImplTest {
// Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(parentHostname));
+ doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value()));
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(30);
@@ -207,21 +204,21 @@ public class NodeAdminStateUpdaterImplTest {
// orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze.
tickAfter(0);
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(orchestrator, never()).resume(eq(parentHostname));
+ verify(orchestrator, never()).resume(eq(hostHostname.value()));
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(0);
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN);
- verify(orchestrator, never()).suspend(eq(parentHostname));
+ verify(orchestrator, never()).suspend(eq(hostHostname.value()));
// When doing batch suspend, only suspend the containers if the host is not active
- List<String> activeHostnames = nodeRepository.getNodes(parentHostname).stream()
+ List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
.map(NodeSpec::getHostname)
.collect(Collectors.toList());
assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(0);
refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
- verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(activeHostnames));
+ verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames));
}
private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) {
@@ -246,10 +243,10 @@ public class NodeAdminStateUpdaterImplTest {
.build())
.collect(Collectors.toList());
- when(nodeRepository.getNodes(eq(parentHostname))).thenReturn(containersToRun);
+ when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun);
- when(nodeRepository.getNode(eq(parentHostname))).thenReturn(new NodeSpec.Builder()
- .hostname(parentHostname)
+ when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(new NodeSpec.Builder()
+ .hostname(hostHostname.value())
.state(hostState)
.nodeType(NodeType.tenant)
.flavor("default")
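
On the test side, the raw parentHostname string is replaced by a typed HostName that is unwrapped with .value() wherever collaborators still take a String, including the node-repository stubs above. The stubbing pattern, sketched generically (hostNodeSpec stands in for the NodeSpec whose Builder chain is truncated above; imports assume Mockito 2):

    import static org.mockito.ArgumentMatchers.eq;
    import static org.mockito.Mockito.when;

    // Generic shape of the stubbing used throughout the updated test.
    when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun);
    when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(hostNodeSpec);
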