diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2019-01-10 15:02:33 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-01-10 15:02:33 +0100 |
commit | dae4281d6f6d2c3eece000d052788e147c9f7580 (patch) | |
tree | a41826946a0aebca60b6167a43348851484312c5 /node-admin/src | |
parent | 7a8c964cbc167b617a6c5feb8c62b7585d7e542a (diff) | |
parent | aaeabd1eb297db6a64b4fc013864304ab96e6ded (diff) |
Merge pull request #8082 from vespa-engine/freva/node-agent-context-supplier
Node agent context supplier
Diffstat (limited to 'node-admin/src')
18 files changed, 733 insertions, 446 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java index c65d59a79dc..af8dfb1fd27 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java @@ -7,7 +7,6 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerStats; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.dockerapi.ProcessResult; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.nodeagent.ContainerData; import java.util.List; @@ -15,7 +14,7 @@ import java.util.Optional; public interface DockerOperations { - void createContainer(NodeAgentContext context, NodeSpec node, ContainerData containerData); + void createContainer(NodeAgentContext context, ContainerData containerData); void startContainer(NodeAgentContext context); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java index 89ab2e60b63..e1b77b6a41b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java @@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.dockerapi.Docker; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.dockerapi.ProcessResult; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.nodeagent.ContainerData; import com.yahoo.vespa.hosted.node.admin.task.util.network.IPAddresses; @@ -55,19 +54,19 @@ public class DockerOperationsImpl implements DockerOperations { } @Override - public void createContainer(NodeAgentContext context, NodeSpec node, ContainerData containerData) { + public void createContainer(NodeAgentContext context, ContainerData containerData) { context.log(logger, "Creating container"); // IPv6 - Assume always valid - Inet6Address ipV6Address = ipAddresses.getIPv6Address(node.getHostname()).orElseThrow( - () -> new RuntimeException("Unable to find a valid IPv6 address for " + node.getHostname() + + Inet6Address ipV6Address = ipAddresses.getIPv6Address(context.node().getHostname()).orElseThrow( + () -> new RuntimeException("Unable to find a valid IPv6 address for " + context.node().getHostname() + ". Missing an AAAA DNS entry?")); Docker.CreateContainerCommand command = docker.createContainerCommand( - node.getWantedDockerImage().get(), - ContainerResources.from(node.getMinCpuCores(), node.getMinMainMemoryAvailableGb()), + context.node().getWantedDockerImage().get(), + ContainerResources.from(context.node().getMinCpuCores(), context.node().getMinMainMemoryAvailableGb()), context.containerName(), - node.getHostname()) + context.node().getHostname()) .withManagedBy(MANAGER_NAME) .withUlimit("nofile", 262_144, 262_144) // The nproc aka RLIMIT_NPROC resource limit works as follows: @@ -100,20 +99,20 @@ public class DockerOperationsImpl implements DockerOperations { command.withIpAddress(ipV6Local); // IPv4 - Only present for some containers - Optional<InetAddress> ipV4Local = ipAddresses.getIPv4Address(node.getHostname()) + Optional<InetAddress> ipV4Local = ipAddresses.getIPv4Address(context.node().getHostname()) .map(ipV4Address -> { InetAddress ipV4Prefix = InetAddresses.forString(IPV4_NPT_PREFIX); return IPAddresses.prefixTranslate(ipV4Address, ipV4Prefix, 2); }); ipV4Local.ifPresent(command::withIpAddress); - addEtcHosts(containerData, node.getHostname(), ipV4Local, ipV6Local); + addEtcHosts(containerData, context.node().getHostname(), ipV4Local, ipV6Local); } addMounts(context, command); // TODO: Enforce disk constraints - long minMainMemoryAvailableMb = (long) (node.getMinMainMemoryAvailableGb() * 1024); + long minMainMemoryAvailableMb = (long) (context.node().getMinMainMemoryAvailableGb() * 1024); if (minMainMemoryAvailableMb > 0) { // VESPA_TOTAL_MEMORY_MB is used to make any jdisc container think the machine // only has this much physical memory (overrides total memory reported by `free -m`). diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index 2fd40a1b486..0cd50a649b1 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.log.LogLevel; import com.yahoo.vespa.hosted.dockerapi.Container; import com.yahoo.vespa.hosted.node.admin.component.TaskContext; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.docker.DockerOperations; import com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoredumpHandler; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; @@ -65,9 +64,9 @@ public class StorageMaintainer { this.archiveContainerStoragePath = archiveContainerStoragePath; } - public void writeMetricsConfig(NodeAgentContext context, NodeSpec node) { + public void writeMetricsConfig(NodeAgentContext context) { List<SecretAgentCheckConfig> configs = new ArrayList<>(); - Map<String, Object> tags = generateTags(context, node); + Map<String, Object> tags = generateTags(context); // host-life Path hostLifeCheckPath = context.pathInNodeUnderVespaHome("libexec/yms/yms_check_host_life"); @@ -154,26 +153,26 @@ public class StorageMaintainer { dockerOperations.executeCommandInContainerAsRoot(context, "service", "yamas-agent", "restart"); } - private Map<String, Object> generateTags(NodeAgentContext context, NodeSpec node) { + private Map<String, Object> generateTags(NodeAgentContext context) { Map<String, String> tags = new LinkedHashMap<>(); tags.put("namespace", "Vespa"); - tags.put("role", nodeTypeToRole(node.getNodeType())); + tags.put("role", nodeTypeToRole(context.node().getNodeType())); tags.put("zone", String.format("%s.%s", context.zoneId().environment().value(), context.zoneId().regionName().value())); - node.getVespaVersion().ifPresent(version -> tags.put("vespaVersion", version)); + context.node().getVespaVersion().ifPresent(version -> tags.put("vespaVersion", version)); if (! isConfigserverLike(context.nodeType())) { - tags.put("flavor", node.getFlavor()); - tags.put("canonicalFlavor", node.getCanonicalFlavor()); - tags.put("state", node.getState().toString()); - node.getParentHostname().ifPresent(parent -> tags.put("parentHostname", parent)); - node.getOwner().ifPresent(owner -> { + tags.put("flavor", context.node().getFlavor()); + tags.put("canonicalFlavor", context.node().getCanonicalFlavor()); + tags.put("state", context.node().getState().toString()); + context.node().getParentHostname().ifPresent(parent -> tags.put("parentHostname", parent)); + context.node().getOwner().ifPresent(owner -> { tags.put("tenantName", owner.getTenant()); tags.put("app", owner.getApplication() + "." + owner.getInstance()); tags.put("applicationName", owner.getApplication()); tags.put("instanceName", owner.getInstance()); tags.put("applicationId", owner.getTenant() + "." + owner.getApplication() + "." + owner.getInstance()); }); - node.getMembership().ifPresent(membership -> { + context.node().getMembership().ifPresent(membership -> { tags.put("clustertype", membership.getClusterType()); tags.put("clusterid", membership.getClusterId()); }); @@ -253,23 +252,23 @@ public class StorageMaintainer { } /** Checks if container has any new coredumps, reports and archives them if so */ - public void handleCoreDumpsForContainer(NodeAgentContext context, NodeSpec node, Optional<Container> container) { - final Map<String, Object> nodeAttributes = getCoredumpNodeAttributes(context, node, container); + public void handleCoreDumpsForContainer(NodeAgentContext context, Optional<Container> container) { + final Map<String, Object> nodeAttributes = getCoredumpNodeAttributes(context, container); coredumpHandler.converge(context, nodeAttributes); } - private Map<String, Object> getCoredumpNodeAttributes(NodeAgentContext context, NodeSpec node, Optional<Container> container) { + private Map<String, Object> getCoredumpNodeAttributes(NodeAgentContext context, Optional<Container> container) { Map<String, String> attributes = new HashMap<>(); - attributes.put("hostname", node.getHostname()); + attributes.put("hostname", context.node().getHostname()); attributes.put("region", context.zoneId().regionName().value()); attributes.put("environment", context.zoneId().environment().value()); - attributes.put("flavor", node.getFlavor()); + attributes.put("flavor", context.node().getFlavor()); attributes.put("kernel_version", System.getProperty("os.version")); container.map(c -> c.image).ifPresent(image -> attributes.put("docker_image", image.asString())); - node.getParentHostname().ifPresent(parent -> attributes.put("parent_hostname", parent)); - node.getVespaVersion().ifPresent(version -> attributes.put("vespa_version", version)); - node.getOwner().ifPresent(owner -> { + context.node().getParentHostname().ifPresent(parent -> attributes.put("parent_hostname", parent)); + context.node().getVespaVersion().ifPresent(version -> attributes.put("vespa_version", version)); + context.node().getOwner().ifPresent(owner -> { attributes.put("tenant", owner.getTenant()); attributes.put("application", owner.getApplication()); attributes.put("instance", owner.getInstance()); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index b191401b8e0..2303f78217c 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -9,6 +9,11 @@ import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.maintenance.acl.AclMaintainer; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextFactory; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextManager; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentFactory; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentScheduler; import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger; import java.time.Clock; @@ -23,7 +28,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import java.util.function.Function; import java.util.stream.Collectors; /** @@ -33,12 +37,15 @@ import java.util.stream.Collectors; */ public class NodeAdminImpl implements NodeAdmin { private static final PrefixLogger logger = PrefixLogger.getNodeAdminLogger(NodeAdmin.class); + private static final Duration NODE_AGENT_FREEZE_TIMEOUT = Duration.ofSeconds(5); + private final ScheduledExecutorService aclScheduler = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("aclscheduler")); private final ScheduledExecutorService metricsScheduler = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("metricsscheduler")); - private final Function<String, NodeAgent> nodeAgentFactory; + private final NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory; + private final NodeAgentContextFactory nodeAgentContextFactory; private final Optional<AclMaintainer> aclMaintainer; private final Clock clock; @@ -46,16 +53,27 @@ public class NodeAdminImpl implements NodeAdmin { private boolean isFrozen; private Instant startOfFreezeConvergence; - private final Map<String, NodeAgent> nodeAgentsByHostname = new ConcurrentHashMap<>(); + private final Map<String, NodeAgentWithScheduler> nodeAgentWithSchedulerByHostname = new ConcurrentHashMap<>(); private final GaugeWrapper numberOfContainersInLoadImageState; private final CounterWrapper numberOfUnhandledExceptionsInNodeAgent; - public NodeAdminImpl(Function<String, NodeAgent> nodeAgentFactory, + public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, + NodeAgentContextFactory nodeAgentContextFactory, Optional<AclMaintainer> aclMaintainer, MetricReceiverWrapper metricReceiver, Clock clock) { - this.nodeAgentFactory = nodeAgentFactory; + this((NodeAgentWithSchedulerFactory) nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext), + nodeAgentContextFactory, aclMaintainer, metricReceiver, clock); + } + + NodeAdminImpl(NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory, + NodeAgentContextFactory nodeAgentContextFactory, + Optional<AclMaintainer> aclMaintainer, + MetricReceiverWrapper metricReceiver, + Clock clock) { + this.nodeAgentWithSchedulerFactory = nodeAgentWithSchedulerFactory; + this.nodeAgentContextFactory = nodeAgentContextFactory; this.aclMaintainer = aclMaintainer; this.clock = clock; @@ -70,22 +88,33 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void refreshContainersToRun(List<NodeSpec> containersToRun) { - final Set<String> hostnamesOfContainersToRun = containersToRun.stream() - .map(NodeSpec::getHostname) - .collect(Collectors.toSet()); + final Map<String, NodeAgentContext> nodeAgentContextsByHostname = containersToRun.stream() + .collect(Collectors.toMap(NodeSpec::getHostname, nodeAgentContextFactory::create)); - synchronizeNodesToNodeAgents(hostnamesOfContainersToRun); + // Stop and remove NodeAgents that should no longer be running + diff(nodeAgentWithSchedulerByHostname.keySet(), nodeAgentContextsByHostname.keySet()) + .forEach(hostname -> nodeAgentWithSchedulerByHostname.remove(hostname).stop()); + + // Start NodeAgent for hostnames that should be running, but aren't yet + diff(nodeAgentContextsByHostname.keySet(), nodeAgentWithSchedulerByHostname.keySet()).forEach(hostname -> { + NodeAgentWithScheduler naws = nodeAgentWithSchedulerFactory.create(nodeAgentContextsByHostname.get(hostname)); + naws.start(); + nodeAgentWithSchedulerByHostname.put(hostname, naws); + }); - updateNodeAgentMetrics(); + // At this point, nodeAgentContextsByHostname and nodeAgentWithSchedulerByHostname should have the same keys + nodeAgentContextsByHostname.forEach((hostname, context) -> + nodeAgentWithSchedulerByHostname.get(hostname).scheduleTickWith(context) + ); } private void updateNodeAgentMetrics() { int numberContainersWaitingImage = 0; int numberOfNewUnhandledExceptions = 0; - for (NodeAgent nodeAgent : nodeAgentsByHostname.values()) { - if (nodeAgent.isDownloadingImage()) numberContainersWaitingImage++; - numberOfNewUnhandledExceptions += nodeAgent.getAndResetNumberOfUnhandledExceptions(); + for (NodeAgentWithScheduler nodeAgentWithScheduler : nodeAgentWithSchedulerByHostname.values()) { + if (nodeAgentWithScheduler.isDownloadingImage()) numberContainersWaitingImage++; + numberOfNewUnhandledExceptions += nodeAgentWithScheduler.getAndResetNumberOfUnhandledExceptions(); } numberOfContainersInLoadImageState.sample(numberContainersWaitingImage); @@ -105,8 +134,8 @@ public class NodeAdminImpl implements NodeAdmin { } // Use filter with count instead of allMatch() because allMatch() will short circuit on first non-match - boolean allNodeAgentsConverged = nodeAgentsByHostname.values().stream() - .filter(nodeAgent -> !nodeAgent.setFrozen(wantFrozen)) + boolean allNodeAgentsConverged = nodeAgentWithSchedulerByHostname.values().parallelStream() + .filter(nodeAgentScheduler -> !nodeAgentScheduler.setFrozen(wantFrozen, NODE_AGENT_FREEZE_TIMEOUT)) .count() == 0; if (wantFrozen) { @@ -134,8 +163,8 @@ public class NodeAdminImpl implements NodeAdmin { public void stopNodeAgentServices(List<String> hostnames) { // Each container may spend 1-1:30 minutes stopping hostnames.parallelStream() - .filter(nodeAgentsByHostname::containsKey) - .map(nodeAgentsByHostname::get) + .filter(nodeAgentWithSchedulerByHostname::containsKey) + .map(nodeAgentWithSchedulerByHostname::get) .forEach(nodeAgent -> { nodeAgent.suspend(); nodeAgent.stopServices(); @@ -146,7 +175,8 @@ public class NodeAdminImpl implements NodeAdmin { public void start() { metricsScheduler.scheduleAtFixedRate(() -> { try { - nodeAgentsByHostname.values().forEach(NodeAgent::updateContainerNodeMetrics); + updateNodeAgentMetrics(); + nodeAgentWithSchedulerByHostname.values().forEach(NodeAgent::updateContainerNodeMetrics); } catch (Throwable e) { logger.warning("Metric fetcher scheduler failed", e); } @@ -166,7 +196,7 @@ public class NodeAdminImpl implements NodeAdmin { aclScheduler.shutdown(); // Stop all node-agents in parallel, will block until the last NodeAgent is stopped - nodeAgentsByHostname.values().parallelStream().forEach(NodeAgent::stop); + nodeAgentWithSchedulerByHostname.values().parallelStream().forEach(NodeAgent::stop); do { try { @@ -185,23 +215,35 @@ public class NodeAdminImpl implements NodeAdmin { return result; } - void synchronizeNodesToNodeAgents(Set<String> hostnamesToRun) { - // Stop and remove NodeAgents that should no longer be running - diff(nodeAgentsByHostname.keySet(), hostnamesToRun) - .forEach(hostname -> nodeAgentsByHostname.remove(hostname).stop()); + static class NodeAgentWithScheduler implements NodeAgent, NodeAgentScheduler { + private final NodeAgent nodeAgent; + private final NodeAgentScheduler nodeAgentScheduler; - // Start NodeAgent for hostnames that should be running, but aren't yet - diff(hostnamesToRun, nodeAgentsByHostname.keySet()) - .forEach(this::startNodeAgent); + private NodeAgentWithScheduler(NodeAgent nodeAgent, NodeAgentScheduler nodeAgentScheduler) { + this.nodeAgent = nodeAgent; + this.nodeAgentScheduler = nodeAgentScheduler; + } + + @Override public void stopServices() { nodeAgent.stopServices(); } + @Override public void suspend() { nodeAgent.suspend(); } + @Override public void start() { nodeAgent.start(); } + @Override public void stop() { nodeAgent.stop(); } + @Override public void updateContainerNodeMetrics() { nodeAgent.updateContainerNodeMetrics(); } + @Override public boolean isDownloadingImage() { return nodeAgent.isDownloadingImage(); } + @Override public int getAndResetNumberOfUnhandledExceptions() { return nodeAgent.getAndResetNumberOfUnhandledExceptions(); } + + @Override public void scheduleTickWith(NodeAgentContext context) { nodeAgentScheduler.scheduleTickWith(context); } + @Override public boolean setFrozen(boolean frozen, Duration timeout) { return nodeAgentScheduler.setFrozen(frozen, timeout); } } - private void startNodeAgent(String hostname) { - if (nodeAgentsByHostname.containsKey(hostname)) - throw new IllegalArgumentException("Attempted to start NodeAgent for hostname " + hostname + - ", but one is already running!"); + @FunctionalInterface + interface NodeAgentWithSchedulerFactory { + NodeAgentWithScheduler create(NodeAgentContext context); + } - NodeAgent agent = nodeAgentFactory.apply(hostname); - agent.start(); - nodeAgentsByHostname.put(hostname, agent); + private static NodeAgentWithScheduler create(Clock clock, NodeAgentFactory nodeAgentFactory, NodeAgentContext context) { + NodeAgentContextManager contextManager = new NodeAgentContextManager(clock, context); + NodeAgent nodeAgent = nodeAgentFactory.create(contextManager); + return new NodeAgentWithScheduler(nodeAgent, contextManager); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index 947e7c85d66..10076c4f48a 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -9,12 +9,6 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent; * @author bakksjo */ public interface NodeAgent { - /** - * Will eventually freeze/unfreeze the node agent - * @param frozen whether node agent should be frozen - * @return True if node agent has converged to the desired state - */ - boolean setFrozen(boolean frozen); /** * Stop services running on node. Depending on the state of the node, {@link #suspend()} might need to be diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContext.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContext.java index 4874eccb913..2e9f58a2c31 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContext.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContext.java @@ -6,6 +6,7 @@ import com.yahoo.vespa.athenz.api.AthenzService; import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.node.admin.component.TaskContext; import com.yahoo.vespa.hosted.node.admin.component.ZoneId; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.docker.DockerNetworking; import java.nio.file.Path; @@ -13,11 +14,17 @@ import java.nio.file.Paths; public interface NodeAgentContext extends TaskContext { + NodeSpec node(); + ContainerName containerName(); - HostName hostname(); + default HostName hostname() { + return HostName.from(node().getHostname()); + } - NodeType nodeType(); + default NodeType nodeType() { + return node().getNodeType(); + } AthenzService identity(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextFactory.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextFactory.java new file mode 100644 index 00000000000..0cfafe34717 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextFactory.java @@ -0,0 +1,12 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; + +/** + * @author freva + */ +@FunctionalInterface +public interface NodeAgentContextFactory { + NodeAgentContext create(NodeSpec nodeSpec); +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextImpl.java index 3c34e35ab46..58414ab55f4 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextImpl.java @@ -1,14 +1,15 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent; import com.yahoo.config.provision.Environment; -import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.SystemName; import com.yahoo.vespa.athenz.api.AthenzService; import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.node.admin.component.ZoneId; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.docker.DockerNetworking; +import com.yahoo.vespa.hosted.provision.Node; import java.nio.file.FileSystem; import java.nio.file.Path; @@ -25,9 +26,8 @@ public class NodeAgentContextImpl implements NodeAgentContext { private static final Path ROOT = Paths.get("/"); private final String logPrefix; + private final NodeSpec node; private final ContainerName containerName; - private final HostName hostName; - private final NodeType nodeType; private final AthenzService identity; private final DockerNetworking dockerNetworking; private final ZoneId zoneId; @@ -36,13 +36,12 @@ public class NodeAgentContextImpl implements NodeAgentContext { private final String vespaUser; private final String vespaUserOnHost; - public NodeAgentContextImpl(String hostname, NodeType nodeType, AthenzService identity, + public NodeAgentContextImpl(NodeSpec node, AthenzService identity, DockerNetworking dockerNetworking, ZoneId zoneId, Path pathToContainerStorage, Path pathToVespaHome, String vespaUser, String vespaUserOnHost) { - this.hostName = HostName.from(Objects.requireNonNull(hostname)); - this.containerName = ContainerName.fromHostname(hostname); - this.nodeType = Objects.requireNonNull(nodeType); + this.node = Objects.requireNonNull(node); + this.containerName = ContainerName.fromHostname(node.getHostname()); this.identity = Objects.requireNonNull(identity); this.dockerNetworking = Objects.requireNonNull(dockerNetworking); this.zoneId = Objects.requireNonNull(zoneId); @@ -54,18 +53,13 @@ public class NodeAgentContextImpl implements NodeAgentContext { } @Override - public ContainerName containerName() { - return containerName; - } - - @Override - public HostName hostname() { - return hostName; + public NodeSpec node() { + return node; } @Override - public NodeType nodeType() { - return nodeType; + public ContainerName containerName() { + return containerName; } @Override @@ -134,12 +128,25 @@ public class NodeAgentContextImpl implements NodeAgentContext { public void log(Logger logger, Level level, String message, Throwable throwable) { logger.log(level, logPrefix + message, throwable); } - + + @Override + public String toString() { + return "NodeAgentContextImpl{" + + "node=" + node + + ", containerName=" + containerName + + ", identity=" + identity + + ", dockerNetworking=" + dockerNetworking + + ", zoneId=" + zoneId + + ", pathToNodeRootOnHost=" + pathToNodeRootOnHost + + ", pathToVespaHome=" + pathToVespaHome + + ", vespaUser='" + vespaUser + '\'' + + ", vespaUserOnHost='" + vespaUserOnHost + '\'' + + '}'; + } /** For testing only! */ public static class Builder { - private final String hostname; - private NodeType nodeType; + private NodeSpec.Builder nodeSpecBuilder = new NodeSpec.Builder(); private AthenzService identity; private DockerNetworking dockerNetworking; private ZoneId zoneId; @@ -148,12 +155,25 @@ public class NodeAgentContextImpl implements NodeAgentContext { private String vespaUser; private String vespaUserOnHost; + public Builder(NodeSpec node) { + this.nodeSpecBuilder = new NodeSpec.Builder(node); + } + + /** + * Creates a NodeAgentContext.Builder with a NodeSpec that has the given hostname and some + * reasonable values for the remaining required NodeSpec fields. Use {@link #Builder(NodeSpec)} + * if you want to control the entire NodeSpec. + */ public Builder(String hostname) { - this.hostname = hostname; + this.nodeSpecBuilder + .hostname(hostname) + .state(Node.State.active) + .nodeType(NodeType.tenant) + .flavor("d-2-8-50"); } public Builder nodeType(NodeType nodeType) { - this.nodeType = nodeType; + this.nodeSpecBuilder.nodeType(nodeType); return this; } @@ -198,8 +218,7 @@ public class NodeAgentContextImpl implements NodeAgentContext { public NodeAgentContextImpl build() { return new NodeAgentContextImpl( - hostname, - Optional.ofNullable(nodeType).orElse(NodeType.tenant), + nodeSpecBuilder.build(), Optional.ofNullable(identity).orElseGet(() -> new AthenzService("domain", "service")), Optional.ofNullable(dockerNetworking).orElse(DockerNetworking.HOST_NETWORK), Optional.ofNullable(zoneId).orElseGet(() -> new ZoneId(SystemName.dev, Environment.dev, RegionName.defaultName())), diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextManager.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextManager.java new file mode 100644 index 00000000000..54f357d5f29 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextManager.java @@ -0,0 +1,102 @@ +// Copyright 2019 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +import java.time.Clock; +import java.time.Duration; +import java.util.Objects; + +/** + * This class should be used by exactly 2 thread, 1 for each interface it implements. + * + * @author freva + */ +public class NodeAgentContextManager implements NodeAgentContextSupplier, NodeAgentScheduler { + + private final Object monitor = new Object(); + private final Clock clock; + + private NodeAgentContext currentContext; + private NodeAgentContext nextContext; + private boolean wantFrozen = false; + private boolean isFrozen = true; + private boolean pendingInterrupt = false; + + public NodeAgentContextManager(Clock clock, NodeAgentContext context) { + this.clock = clock; + this.currentContext = context; + } + + @Override + public void scheduleTickWith(NodeAgentContext context) { + synchronized (monitor) { + nextContext = Objects.requireNonNull(context); + monitor.notifyAll(); // Notify of new context + } + } + + @Override + public boolean setFrozen(boolean frozen, Duration timeout) { + synchronized (monitor) { + if (wantFrozen != frozen) { + wantFrozen = frozen; + monitor.notifyAll(); // Notify the supplier of the wantFrozen change + } + + boolean successful; + long remainder; + long end = clock.instant().plus(timeout).toEpochMilli(); + while (!(successful = isFrozen == frozen) && (remainder = end - clock.millis()) > 0) { + try { + monitor.wait(remainder); // Wait with timeout until the supplier is has reached wanted frozen state + } catch (InterruptedException ignored) { } + } + + return successful; + } + } + + @Override + public NodeAgentContext nextContext() throws InterruptedException { + synchronized (monitor) { + while (setAndGetIsFrozen(wantFrozen) || nextContext == null) { + if (pendingInterrupt) { + pendingInterrupt = false; + throw new InterruptedException("interrupt() was called before next context was scheduled"); + } + + try { + monitor.wait(); // Wait until scheduler provides a new context + } catch (InterruptedException ignored) { } + } + + currentContext = nextContext; + nextContext = null; + return currentContext; + } + } + + @Override + public NodeAgentContext currentContext() { + synchronized (monitor) { + return currentContext; + } + } + + @Override + public void interrupt() { + synchronized (monitor) { + pendingInterrupt = true; + monitor.notifyAll(); + } + } + + private boolean setAndGetIsFrozen(boolean isFrozen) { + synchronized (monitor) { + if (this.isFrozen != isFrozen) { + this.isFrozen = isFrozen; + monitor.notifyAll(); // Notify the scheduler of the isFrozen change + } + return this.isFrozen; + } + } +}
\ No newline at end of file diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextSupplier.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextSupplier.java new file mode 100644 index 00000000000..1fc730a3cb0 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextSupplier.java @@ -0,0 +1,21 @@ +// Copyright 2019 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +/** + * @author freva + */ +public interface NodeAgentContextSupplier { + + /** + * Blocks until the next context is ready + * @return context + * @throws InterruptedException if {@link #interrupt()} was called before this method returned + */ + NodeAgentContext nextContext() throws InterruptedException; + + /** @return the last context returned by {@link #nextContext()} or a default value */ + NodeAgentContext currentContext(); + + /** Interrupts the thread(s) currently waiting in {@link #nextContext()} */ + void interrupt(); +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentFactory.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentFactory.java new file mode 100644 index 00000000000..bd13b7eb094 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentFactory.java @@ -0,0 +1,10 @@ +// Copyright 2019 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +/** + * @author freva + */ +@FunctionalInterface +public interface NodeAgentFactory { + NodeAgent create(NodeAgentContextSupplier contextSupplier); +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 98975dddb56..0bfff82a055 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -17,20 +17,17 @@ import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeAttributes; +import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; import com.yahoo.vespa.hosted.node.admin.docker.DockerOperations; import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; -import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; import com.yahoo.vespa.hosted.node.admin.maintenance.acl.AclMaintainer; import com.yahoo.vespa.hosted.node.admin.maintenance.identity.AthenzCredentialsMaintainer; import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.node.admin.util.SecretAgentCheckConfig; import com.yahoo.vespa.hosted.provision.Node; -import java.time.Clock; -import java.time.Duration; -import java.time.Instant; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -59,30 +56,21 @@ public class NodeAgentImpl implements NodeAgent { private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName()); - private final Object monitor = new Object(); private final AtomicBoolean terminated = new AtomicBoolean(false); - - private boolean isFrozen = true; - private boolean wantFrozen = false; - private boolean workToDoNow = true; - private boolean expectNodeNotInNodeRepo = false; private boolean hasResumedNode = false; private boolean hasStartedServices = true; - private final NodeAgentContext context; + private final NodeAgentContextSupplier contextSupplier; private final NodeRepository nodeRepository; private final Orchestrator orchestrator; private final DockerOperations dockerOperations; private final StorageMaintainer storageMaintainer; - private final Clock clock; - private final Duration timeBetweenEachConverge; private final Optional<AthenzCredentialsMaintainer> athenzCredentialsMaintainer; private final Optional<AclMaintainer> aclMaintainer; private final Optional<HealthChecker> healthChecker; private int numberOfUnhandledException = 0; private DockerImage imageBeingDownloaded = null; - private Instant lastConverge; private long currentRebootGeneration = 0; private Optional<Long> currentRestartGeneration = Optional.empty(); @@ -115,24 +103,19 @@ public class NodeAgentImpl implements NodeAgent { // Created in NodeAdminImpl public NodeAgentImpl( - final NodeAgentContext context, + final NodeAgentContextSupplier contextSupplier, final NodeRepository nodeRepository, final Orchestrator orchestrator, final DockerOperations dockerOperations, final StorageMaintainer storageMaintainer, - final Clock clock, - final Duration timeBetweenEachConverge, final Optional<AthenzCredentialsMaintainer> athenzCredentialsMaintainer, final Optional<AclMaintainer> aclMaintainer, final Optional<HealthChecker> healthChecker) { - this.context = context; + this.contextSupplier = contextSupplier; this.nodeRepository = nodeRepository; this.orchestrator = orchestrator; this.dockerOperations = dockerOperations; this.storageMaintainer = storageMaintainer; - this.clock = clock; - this.timeBetweenEachConverge = timeBetweenEachConverge; - this.lastConverge = clock.instant(); this.athenzCredentialsMaintainer = athenzCredentialsMaintainer; this.aclMaintainer = aclMaintainer; this.healthChecker = healthChecker; @@ -140,16 +123,15 @@ public class NodeAgentImpl implements NodeAgent { this.loopThread = new Thread(() -> { while (!terminated.get()) { try { - tick(); - } catch (Throwable t) { - numberOfUnhandledException++; - context.log(logger, LogLevel.ERROR, "Unhandled throwable, ignoring", t); - } + NodeAgentContext context = contextSupplier.nextContext(); + converge(context); + } catch (InterruptedException ignored) { } } }); - this.loopThread.setName("tick-" + context.hostname()); + this.loopThread.setName("tick-" + contextSupplier.currentContext().hostname()); this.serviceRestarter = service -> { + NodeAgentContext context = contextSupplier.currentContext(); try { ProcessResult processResult = dockerOperations.executeCommandInContainerAsRoot( context, "service", service, "restart"); @@ -164,46 +146,29 @@ public class NodeAgentImpl implements NodeAgent { } @Override - public boolean setFrozen(boolean frozen) { - synchronized (monitor) { - if (wantFrozen != frozen) { - wantFrozen = frozen; - context.log(logger, LogLevel.DEBUG, wantFrozen ? "Freezing" : "Unfreezing"); - signalWorkToBeDone(); - } - - return isFrozen == frozen; - } - } - - @Override public void start() { - context.log(logger, "Starting with interval " + timeBetweenEachConverge.toMillis() + " ms"); loopThread.start(); } @Override public void stop() { - filebeatRestarter.shutdown(); if (!terminated.compareAndSet(false, true)) { throw new RuntimeException("Can not re-stop a node agent."); } - signalWorkToBeDone(); + filebeatRestarter.shutdown(); + contextSupplier.interrupt(); do { try { loopThread.join(); filebeatRestarter.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); - } catch (InterruptedException e) { - context.log(logger, LogLevel.ERROR, - "Interrupted while waiting for converge thread and filebeatRestarter scheduler to shutdown"); - } + } catch (InterruptedException ignored) { } } while (loopThread.isAlive() || !filebeatRestarter.isTerminated()); - context.log(logger, "Stopped"); + contextSupplier.currentContext().log(logger, "Stopped"); } - void startServicesIfNeeded() { + void startServicesIfNeeded(NodeAgentContext context) { if (!hasStartedServices) { context.log(logger, "Starting services"); dockerOperations.startServices(context); @@ -211,10 +176,10 @@ public class NodeAgentImpl implements NodeAgent { } } - void resumeNodeIfNeeded(NodeSpec node) { + void resumeNodeIfNeeded(NodeAgentContext context) { if (!hasResumedNode) { if (!currentFilebeatRestarter.isPresent()) { - storageMaintainer.writeMetricsConfig(context, node); + storageMaintainer.writeMetricsConfig(context); currentFilebeatRestarter = Optional.of(filebeatRestarter.scheduleWithFixedDelay( () -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS)); } @@ -225,31 +190,31 @@ public class NodeAgentImpl implements NodeAgent { } } - private void updateNodeRepoWithCurrentAttributes(final NodeSpec node) { + private void updateNodeRepoWithCurrentAttributes(NodeAgentContext context) { final NodeAttributes currentNodeAttributes = new NodeAttributes(); final NodeAttributes newNodeAttributes = new NodeAttributes(); - if (node.getWantedRestartGeneration().isPresent() && - !Objects.equals(node.getCurrentRestartGeneration(), currentRestartGeneration)) { - currentNodeAttributes.withRestartGeneration(node.getCurrentRestartGeneration()); + if (context.node().getWantedRestartGeneration().isPresent() && + !Objects.equals(context.node().getCurrentRestartGeneration(), currentRestartGeneration)) { + currentNodeAttributes.withRestartGeneration(context.node().getCurrentRestartGeneration()); newNodeAttributes.withRestartGeneration(currentRestartGeneration); } - if (!Objects.equals(node.getCurrentRebootGeneration(), currentRebootGeneration)) { - currentNodeAttributes.withRebootGeneration(node.getCurrentRebootGeneration()); + if (!Objects.equals(context.node().getCurrentRebootGeneration(), currentRebootGeneration)) { + currentNodeAttributes.withRebootGeneration(context.node().getCurrentRebootGeneration()); newNodeAttributes.withRebootGeneration(currentRebootGeneration); } - Optional<DockerImage> actualDockerImage = node.getWantedDockerImage().filter(n -> containerState == UNKNOWN); - if (!Objects.equals(node.getCurrentDockerImage(), actualDockerImage)) { - currentNodeAttributes.withDockerImage(node.getCurrentDockerImage().orElse(new DockerImage(""))); + Optional<DockerImage> actualDockerImage = context.node().getWantedDockerImage().filter(n -> containerState == UNKNOWN); + if (!Objects.equals(context.node().getCurrentDockerImage(), actualDockerImage)) { + currentNodeAttributes.withDockerImage(context.node().getCurrentDockerImage().orElse(new DockerImage(""))); newNodeAttributes.withDockerImage(actualDockerImage.orElse(new DockerImage(""))); } - publishStateToNodeRepoIfChanged(currentNodeAttributes, newNodeAttributes); + publishStateToNodeRepoIfChanged(context, currentNodeAttributes, newNodeAttributes); } - private void publishStateToNodeRepoIfChanged(NodeAttributes currentAttributes, NodeAttributes newAttributes) { + private void publishStateToNodeRepoIfChanged(NodeAgentContext context, NodeAttributes currentAttributes, NodeAttributes newAttributes) { if (!currentAttributes.equals(newAttributes)) { context.log(logger, "Publishing new set of attributes to node repo: %s -> %s", currentAttributes, newAttributes); @@ -257,9 +222,9 @@ public class NodeAgentImpl implements NodeAgent { } } - private void startContainer(NodeSpec node) { - ContainerData containerData = createContainerData(context, node); - dockerOperations.createContainer(context, node, containerData); + private void startContainer(NodeAgentContext context) { + ContainerData containerData = createContainerData(context); + dockerOperations.createContainer(context, containerData); dockerOperations.startContainer(context); lastCpuMetric = new CpuUsageReporter(); @@ -268,14 +233,15 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, "Container successfully started, new containerState is " + containerState); } - private Optional<Container> removeContainerIfNeededUpdateContainerState(NodeSpec node, Optional<Container> existingContainer) { + private Optional<Container> removeContainerIfNeededUpdateContainerState( + NodeAgentContext context, Optional<Container> existingContainer) { return existingContainer - .flatMap(container -> removeContainerIfNeeded(node, container)) + .flatMap(container -> removeContainerIfNeeded(context, container)) .map(container -> { - shouldRestartServices(node).ifPresent(restartReason -> { + shouldRestartServices(context.node()).ifPresent(restartReason -> { context.log(logger, "Will restart services: " + restartReason); - restartServices(node, container); - currentRestartGeneration = node.getWantedRestartGeneration(); + restartServices(context, container); + currentRestartGeneration = context.node().getWantedRestartGeneration(); }); return container; }); @@ -292,17 +258,18 @@ public class NodeAgentImpl implements NodeAgent { return Optional.empty(); } - private void restartServices(NodeSpec node, Container existingContainer) { - if (existingContainer.state.isRunning() && node.getState() == Node.State.active) { + private void restartServices(NodeAgentContext context, Container existingContainer) { + if (existingContainer.state.isRunning() && context.node().getState() == Node.State.active) { context.log(logger, "Restarting services"); // Since we are restarting the services we need to suspend the node. - orchestratorSuspendNode(); + orchestratorSuspendNode(context); dockerOperations.restartVespa(context); } } @Override public void stopServices() { + NodeAgentContext context = contextSupplier.currentContext(); context.log(logger, "Stopping services"); if (containerState == ABSENT) return; try { @@ -315,6 +282,7 @@ public class NodeAgentImpl implements NodeAgent { @Override public void suspend() { + NodeAgentContext context = contextSupplier.currentContext(); context.log(logger, "Suspending services on node"); if (containerState == ABSENT) return; try { @@ -358,18 +326,18 @@ public class NodeAgentImpl implements NodeAgent { return Optional.empty(); } - private Optional<Container> removeContainerIfNeeded(NodeSpec node, Container existingContainer) { - Optional<String> removeReason = shouldRemoveContainer(node, existingContainer); + private Optional<Container> removeContainerIfNeeded(NodeAgentContext context, Container existingContainer) { + Optional<String> removeReason = shouldRemoveContainer(context.node(), existingContainer); if (removeReason.isPresent()) { context.log(logger, "Will remove container: " + removeReason.get()); if (existingContainer.state.isRunning()) { - if (node.getState() == Node.State.active) { - orchestratorSuspendNode(); + if (context.node().getState() == Node.State.active) { + orchestratorSuspendNode(context); } try { - if (node.getState() != Node.State.dirty) { + if (context.node().getState() != Node.State.dirty) { suspend(); } stopServices(); @@ -378,9 +346,9 @@ public class NodeAgentImpl implements NodeAgent { } } stopFilebeatSchedulerIfNeeded(); - storageMaintainer.handleCoreDumpsForContainer(context, node, Optional.of(existingContainer)); + storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer)); dockerOperations.removeContainer(context, existingContainer); - currentRebootGeneration = node.getWantedRebootGeneration(); + currentRebootGeneration = context.node().getWantedRebootGeneration(); containerState = ABSENT; context.log(logger, "Container successfully removed, new containerState is " + containerState); return Optional.empty(); @@ -399,78 +367,29 @@ public class NodeAgentImpl implements NodeAgent { } } - private void signalWorkToBeDone() { - synchronized (monitor) { - if (!workToDoNow) { - workToDoNow = true; - context.log(logger, LogLevel.DEBUG, "Signaling work to be done"); - monitor.notifyAll(); - } - } - } - - void tick() { - boolean isFrozenCopy; - synchronized (monitor) { - while (!workToDoNow) { - long remainder = timeBetweenEachConverge - .minus(Duration.between(lastConverge, clock.instant())) - .toMillis(); - if (remainder > 0) { - try { - monitor.wait(remainder); - } catch (InterruptedException e) { - context.log(logger, LogLevel.ERROR, "Interrupted while sleeping before tick, ignoring"); - } - } else break; - } - lastConverge = clock.instant(); - workToDoNow = false; - - if (isFrozen != wantFrozen) { - isFrozen = wantFrozen; - context.log(logger, "Updated NodeAgent's frozen state, new value: isFrozen: " + isFrozen); - } - isFrozenCopy = isFrozen; - } - - if (isFrozenCopy) { - context.log(logger, LogLevel.DEBUG, "tick: isFrozen"); - } else { - try { - converge(); - } catch (OrchestratorException | ConvergenceException e) { - context.log(logger, e.getMessage()); - } catch (ContainerNotFoundException e) { - containerState = ABSENT; - context.log(logger, LogLevel.WARNING, "Container unexpectedly gone, resetting containerState to " + containerState); - } catch (DockerException e) { - numberOfUnhandledException++; - context.log(logger, LogLevel.ERROR, "Caught a DockerException", e); - } catch (Exception e) { - numberOfUnhandledException++; - context.log(logger, LogLevel.ERROR, "Unhandled exception, ignoring.", e); - } + public void converge(NodeAgentContext context) { + try { + doConverge(context); + } catch (OrchestratorException | ConvergenceException e) { + context.log(logger, e.getMessage()); + } catch (ContainerNotFoundException e) { + containerState = ABSENT; + context.log(logger, LogLevel.WARNING, "Container unexpectedly gone, resetting containerState to " + containerState); + } catch (DockerException e) { + numberOfUnhandledException++; + context.log(logger, LogLevel.ERROR, "Caught a DockerException", e); + } catch (Throwable e) { + numberOfUnhandledException++; + context.log(logger, LogLevel.ERROR, "Unhandled exception, ignoring", e); } } // Public for testing - void converge() { - final Optional<NodeSpec> optionalNode = nodeRepository.getOptionalNode(context.hostname().value()); - - // We just removed the node from node repo, so this is expected until NodeAdmin stop this NodeAgent - if (!optionalNode.isPresent() && expectNodeNotInNodeRepo) { - context.log(logger, LogLevel.INFO, "Node removed from node repo (as expected)"); - return; - } - - final NodeSpec node = optionalNode.orElseThrow(() -> - new IllegalStateException(String.format("Node '%s' missing from node repository", context.hostname()))); - expectNodeNotInNodeRepo = false; - - Optional<Container> container = getContainer(); + void doConverge(NodeAgentContext context) { + NodeSpec node = context.node(); + Optional<Container> container = getContainer(context); if (!node.equals(lastNode)) { - logChangesToNodeSpec(lastNode, node); + logChangesToNodeSpec(context, lastNode, node); // Current reboot generation uninitialized or incremented from outside to cancel reboot if (currentRebootGeneration < node.getCurrentRebootGeneration()) @@ -485,7 +404,7 @@ public class NodeAgentImpl implements NodeAgent { // Every time the node spec changes, we should clear the metrics for this container as the dimensions // will change and we will be reporting duplicate metrics. if (container.map(c -> c.state.isRunning()).orElse(false)) { - storageMaintainer.writeMetricsConfig(context, node); + storageMaintainer.writeMetricsConfig(context); } lastNode = node; @@ -496,11 +415,11 @@ public class NodeAgentImpl implements NodeAgent { case reserved: case parked: case failed: - removeContainerIfNeededUpdateContainerState(node, container); - updateNodeRepoWithCurrentAttributes(node); + removeContainerIfNeededUpdateContainerState(context, container); + updateNodeRepoWithCurrentAttributes(context); break; case active: - storageMaintainer.handleCoreDumpsForContainer(context, node, container); + storageMaintainer.handleCoreDumpsForContainer(context, container); storageMaintainer.getDiskUsageFor(context) .map(diskUsage -> (double) diskUsage / BYTES_IN_GB / node.getMinDiskAvailableGb()) @@ -512,17 +431,17 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, LogLevel.DEBUG, "Waiting for image to download " + imageBeingDownloaded.asString()); return; } - container = removeContainerIfNeededUpdateContainerState(node, container); + container = removeContainerIfNeededUpdateContainerState(context, container); athenzCredentialsMaintainer.ifPresent(maintainer -> maintainer.converge(context)); if (! container.isPresent()) { containerState = STARTING; - startContainer(node); + startContainer(context); containerState = UNKNOWN; aclMaintainer.ifPresent(AclMaintainer::converge); } - startServicesIfNeeded(); - resumeNodeIfNeeded(node); + startServicesIfNeeded(context); + resumeNodeIfNeeded(context); healthChecker.ifPresent(checker -> checker.verifyHealth(context)); // Because it's more important to stop a bad release from rolling out in prod, @@ -535,32 +454,31 @@ public class NodeAgentImpl implements NodeAgent { // has been successfully rolled out. // - Slobrok and internal orchestrator state is used to determine whether // to allow upgrade (suspend). - updateNodeRepoWithCurrentAttributes(node); + updateNodeRepoWithCurrentAttributes(context); context.log(logger, "Call resume against Orchestrator"); orchestrator.resume(context.hostname().value()); break; case inactive: - removeContainerIfNeededUpdateContainerState(node, container); - updateNodeRepoWithCurrentAttributes(node); + removeContainerIfNeededUpdateContainerState(context, container); + updateNodeRepoWithCurrentAttributes(context); break; case provisioned: nodeRepository.setNodeState(context.hostname().value(), Node.State.dirty); break; case dirty: - removeContainerIfNeededUpdateContainerState(node, container); + removeContainerIfNeededUpdateContainerState(context, container); context.log(logger, "State is " + node.getState() + ", will delete application storage and mark node as ready"); athenzCredentialsMaintainer.ifPresent(maintainer -> maintainer.clearCredentials(context)); storageMaintainer.archiveNodeStorage(context); - updateNodeRepoWithCurrentAttributes(node); + updateNodeRepoWithCurrentAttributes(context); nodeRepository.setNodeState(context.hostname().value(), Node.State.ready); - expectNodeNotInNodeRepo = true; break; default: throw new RuntimeException("UNKNOWN STATE " + node.getState().name()); } } - private void logChangesToNodeSpec(NodeSpec lastNode, NodeSpec node) { + private static void logChangesToNodeSpec(NodeAgentContext context, NodeSpec lastNode, NodeSpec node) { StringBuilder builder = new StringBuilder(); appendIfDifferent(builder, "state", lastNode, node, NodeSpec::getState); if (builder.length() > 0) { @@ -572,7 +490,7 @@ public class NodeAgentImpl implements NodeAgent { return value == null ? "[absent]" : value.toString(); } - private <T> void appendIfDifferent(StringBuilder builder, String name, NodeSpec oldNode, NodeSpec newNode, Function<NodeSpec, T> getter) { + private static <T> void appendIfDifferent(StringBuilder builder, String name, NodeSpec oldNode, NodeSpec newNode, Function<NodeSpec, T> getter) { T oldValue = oldNode == null ? null : getter.apply(oldNode); T newValue = getter.apply(newNode); if (!Objects.equals(oldValue, newValue)) { @@ -592,8 +510,9 @@ public class NodeAgentImpl implements NodeAgent { @SuppressWarnings("unchecked") public void updateContainerNodeMetrics() { - final NodeSpec node = lastNode; - if (node == null || containerState != UNKNOWN) return; + if (containerState != UNKNOWN) return; + final NodeAgentContext context = contextSupplier.currentContext(); + final NodeSpec node = context.node(); Optional<ContainerStats> containerStats = dockerOperations.getContainerStats(context); if (!containerStats.isPresent()) return; @@ -660,10 +579,10 @@ public class NodeAgentImpl implements NodeAgent { metrics.add(networkMetrics); }); - pushMetricsToContainer(metrics); + pushMetricsToContainer(context, metrics); } - private void pushMetricsToContainer(List<DimensionMetrics> metrics) { + private void pushMetricsToContainer(NodeAgentContext context, List<DimensionMetrics> metrics) { StringBuilder params = new StringBuilder(); try { for (DimensionMetrics dimensionMetrics : metrics) { @@ -679,7 +598,7 @@ public class NodeAgentImpl implements NodeAgent { } } - private Optional<Container> getContainer() { + private Optional<Container> getContainer(NodeAgentContext context) { if (containerState == ABSENT) return Optional.empty(); Optional<Container> container = dockerOperations.getContainer(context); if (! container.isPresent()) containerState = ABSENT; @@ -743,12 +662,12 @@ public class NodeAgentImpl implements NodeAgent { // More generally, the node repo response should contain sufficient info on what the docker image is, // to allow the node admin to make decisions that depend on the docker image. Or, each docker image // needs to contain routines for drain and suspend. For many images, these can just be dummy routines. - private void orchestratorSuspendNode() { + private void orchestratorSuspendNode(NodeAgentContext context) { context.log(logger, "Ask Orchestrator for permission to suspend node"); orchestrator.suspend(context.hostname().value()); } - protected ContainerData createContainerData(NodeAgentContext context, NodeSpec node) { + protected ContainerData createContainerData(NodeAgentContext context) { return (pathInContainer, data) -> { throw new UnsupportedOperationException("addFile not implemented"); }; diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentScheduler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentScheduler.java new file mode 100644 index 00000000000..540601ffa4f --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentScheduler.java @@ -0,0 +1,21 @@ +// Copyright 2019 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +import java.time.Duration; + +/** + * @author freva + */ +public interface NodeAgentScheduler { + + /** Schedule a tick for NodeAgent to run with the given NodeAgentContext */ + void scheduleTickWith(NodeAgentContext context); + + /** + * Will eventually freeze/unfreeze the node agent + * @param frozen whether node agent should be frozen + * @param timeout maximum duration this method should block while waiting for NodeAgent to reach target state + * @return True if node agent has converged to the desired state + */ + boolean setFrozen(boolean frozen, Duration timeout); +} diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index cb3a1fb5e2c..109bce4c13f 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -15,8 +15,9 @@ import com.yahoo.vespa.hosted.node.admin.docker.DockerOperationsImpl; import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminImpl; import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater; -import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextFactory; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentFactory; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl; import com.yahoo.vespa.hosted.node.admin.task.util.network.IPAddressesMock; import com.yahoo.vespa.hosted.provision.Node; @@ -30,7 +31,6 @@ import java.nio.file.Paths; import java.time.Clock; import java.time.Duration; import java.util.Optional; -import java.util.function.Function; import java.util.logging.Logger; import static com.yahoo.yolean.Exceptions.uncheck; @@ -87,19 +87,20 @@ public class DockerTester implements AutoCloseable { .build(); nodeRepository.updateNodeRepositoryNode(hostSpec); - Clock clock = Clock.systemUTC(); FileSystem fileSystem = TestFileSystem.create(); DockerOperations dockerOperations = new DockerOperationsImpl(docker, processExecuter, ipAddresses); MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); - Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl( - new NodeAgentContextImpl.Builder(hostName).fileSystem(fileSystem).build(), nodeRepository, - orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty(), Optional.empty()); - nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), mr, Clock.systemUTC()); + NodeAgentFactory nodeAgentFactory = contextSupplier -> new NodeAgentImpl( + contextSupplier, nodeRepository, + orchestrator, dockerOperations, storageMaintainer, Optional.empty(), Optional.empty(), Optional.empty()); + NodeAgentContextFactory nodeAgentContextFactory = nodeSpec -> + new NodeAgentContextImpl.Builder(nodeSpec).fileSystem(fileSystem).build(); + nodeAdmin = new NodeAdminImpl(nodeAgentFactory, nodeAgentContextFactory, Optional.empty(), mr, Clock.systemUTC()); nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, orchestrator, nodeAdmin, HOST_HOSTNAME); - this.loopThread = new Thread(() -> { + loopThread = new Thread(() -> { nodeAdminStateUpdater.start(); while (! terminated) { @@ -135,8 +136,10 @@ public class DockerTester implements AutoCloseable { @Override public void close() { - terminated = true; + // First, stop NodeAdmin and all the NodeAgents + nodeAdmin.stop(); + terminated = true; do { try { loopThread.join(); @@ -144,8 +147,5 @@ public class DockerTester implements AutoCloseable { e.printStackTrace(); } } while (loopThread.isAlive()); - - // Finally, stop NodeAdmin and all the NodeAgents - nodeAdmin.stop(); } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java index 9ea5c87511b..05b9c413594 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java @@ -152,12 +152,8 @@ public class StorageMaintainerTest { } private Path executeAs(NodeType nodeType) { - NodeAgentContext context = new NodeAgentContextImpl.Builder("host123-5.test.domain.tld") - .nodeType(nodeType) - .fileSystem(TestFileSystem.create()) - .zoneId(new ZoneId(SystemName.dev, Environment.prod, RegionName.from("us-north-1"))).build(); NodeSpec nodeSpec = new NodeSpec.Builder() - .hostname(context.hostname().value()) + .hostname("host123-5.test.domain.tld") .nodeType(nodeType) .state(Node.State.active) .parentHostname("host123.test.domain.tld") @@ -167,9 +163,12 @@ public class StorageMaintainerTest { .flavor("d-2-8-50") .canonicalFlavor("d-2-8-50") .build(); + NodeAgentContext context = new NodeAgentContextImpl.Builder(nodeSpec) + .fileSystem(TestFileSystem.create()) + .zoneId(new ZoneId(SystemName.dev, Environment.prod, RegionName.from("us-north-1"))).build(); Path path = context.pathOnHostFromPathInNode("/etc/yamas-agent"); uncheck(() -> Files.createDirectories(path)); - storageMaintainer.writeMetricsConfig(context, nodeSpec); + storageMaintainer.writeMetricsConfig(context); return path; } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java index 3860e2e9780..47e220a968b 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java @@ -1,22 +1,23 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeadmin; +import com.yahoo.config.provision.NodeType; import com.yahoo.metrics.simple.MetricReceiver; import com.yahoo.test.ManualClock; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; -import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent; -import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextFactory; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; +import com.yahoo.vespa.hosted.provision.Node; import org.junit.Test; import org.mockito.InOrder; import java.time.Duration; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; import java.util.List; import java.util.Optional; -import java.util.Set; -import java.util.function.Function; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -31,75 +32,72 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoMoreInteractions; import static org.mockito.Mockito.when; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminImpl.NodeAgentWithScheduler; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminImpl.NodeAgentWithSchedulerFactory; + /** * @author bakksjo */ public class NodeAdminImplTest { - // Trick to allow mocking of typed interface without casts/warnings. - private interface NodeAgentFactory extends Function<String, NodeAgent> {} - private final Function<String, NodeAgent> nodeAgentFactory = mock(NodeAgentFactory.class); + + private final NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory = mock(NodeAgentWithSchedulerFactory.class); + private final NodeAgentContextFactory nodeAgentContextFactory = mock(NodeAgentContextFactory.class); private final ManualClock clock = new ManualClock(); - private final NodeAdminImpl nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), - new MetricReceiverWrapper(MetricReceiver.nullImplementation), clock); + private final NodeAdminImpl nodeAdmin = new NodeAdminImpl(nodeAgentWithSchedulerFactory, nodeAgentContextFactory, + Optional.empty(), new MetricReceiverWrapper(MetricReceiver.nullImplementation), clock); @Test public void nodeAgentsAreProperlyLifeCycleManaged() { - final String hostName1 = "host1.test.yahoo.com"; - final String hostName2 = "host2.test.yahoo.com"; - final NodeAgent nodeAgent1 = mock(NodeAgentImpl.class); - final NodeAgent nodeAgent2 = mock(NodeAgentImpl.class); - when(nodeAgentFactory.apply(eq(hostName1))).thenReturn(nodeAgent1); - when(nodeAgentFactory.apply(eq(hostName2))).thenReturn(nodeAgent2); + final NodeSpec nodeSpec1 = createNodeSpec("host1.test.yahoo.com"); + final NodeSpec nodeSpec2 = createNodeSpec("host2.test.yahoo.com"); + final NodeAgentWithScheduler nodeAgent1 = mockNodeAgentWithSchedulerFactory(nodeSpec1); + final NodeAgentWithScheduler nodeAgent2 = mockNodeAgentWithSchedulerFactory(nodeSpec2); + final InOrder inOrder = inOrder(nodeAgentWithSchedulerFactory, nodeAgent1, nodeAgent2); + nodeAdmin.refreshContainersToRun(Collections.emptyList()); + verifyNoMoreInteractions(nodeAgentWithSchedulerFactory); - final InOrder inOrder = inOrder(nodeAgentFactory, nodeAgent1, nodeAgent2); - nodeAdmin.synchronizeNodesToNodeAgents(Collections.emptySet()); - verifyNoMoreInteractions(nodeAgentFactory); - - nodeAdmin.synchronizeNodesToNodeAgents(Collections.singleton(hostName1)); - inOrder.verify(nodeAgentFactory).apply(hostName1); + nodeAdmin.refreshContainersToRun(Collections.singletonList(nodeSpec1)); inOrder.verify(nodeAgent1).start(); + inOrder.verify(nodeAgent2, never()).start(); inOrder.verify(nodeAgent1, never()).stop(); - nodeAdmin.synchronizeNodesToNodeAgents(Collections.singleton(hostName1)); - inOrder.verify(nodeAgentFactory, never()).apply(any(String.class)); + nodeAdmin.refreshContainersToRun(Collections.singletonList(nodeSpec1)); + inOrder.verify(nodeAgentWithSchedulerFactory, never()).create(any()); inOrder.verify(nodeAgent1, never()).start(); inOrder.verify(nodeAgent1, never()).stop(); - nodeAdmin.synchronizeNodesToNodeAgents(Collections.emptySet()); - inOrder.verify(nodeAgentFactory, never()).apply(any(String.class)); + nodeAdmin.refreshContainersToRun(Collections.emptyList()); + inOrder.verify(nodeAgentWithSchedulerFactory, never()).create(any()); verify(nodeAgent1).stop(); - nodeAdmin.synchronizeNodesToNodeAgents(Collections.singleton(hostName2)); - inOrder.verify(nodeAgentFactory).apply(hostName2); + nodeAdmin.refreshContainersToRun(Collections.singletonList(nodeSpec2)); inOrder.verify(nodeAgent2).start(); inOrder.verify(nodeAgent2, never()).stop(); - verify(nodeAgent1).stop(); + inOrder.verify(nodeAgent1, never()).stop(); - nodeAdmin.synchronizeNodesToNodeAgents(Collections.emptySet()); - inOrder.verify(nodeAgentFactory, never()).apply(any(String.class)); + nodeAdmin.refreshContainersToRun(Collections.emptyList()); + inOrder.verify(nodeAgentWithSchedulerFactory, never()).create(any()); inOrder.verify(nodeAgent2, never()).start(); inOrder.verify(nodeAgent2).stop(); - - verifyNoMoreInteractions(nodeAgent1); - verifyNoMoreInteractions(nodeAgent2); + inOrder.verify(nodeAgent1, never()).start(); + inOrder.verify(nodeAgent1, never()).stop(); } @Test public void testSetFrozen() { - List<NodeAgent> nodeAgents = new ArrayList<>(); - Set<String> existingContainerHostnames = new HashSet<>(); + List<NodeSpec> nodeSpecs = new ArrayList<>(); + List<NodeAgentWithScheduler> nodeAgents = new ArrayList<>(); for (int i = 0; i < 3; i++) { - final String hostName = "host" + i + ".test.yahoo.com"; - NodeAgent nodeAgent = mock(NodeAgent.class); - nodeAgents.add(nodeAgent); - when(nodeAgentFactory.apply(eq(hostName))).thenReturn(nodeAgent); + NodeSpec nodeSpec = createNodeSpec("host" + i + ".test.yahoo.com"); + NodeAgentWithScheduler nodeAgent = mockNodeAgentWithSchedulerFactory(nodeSpec); - existingContainerHostnames.add(hostName); + nodeSpecs.add(nodeSpec); + nodeAgents.add(nodeAgent); } - nodeAdmin.synchronizeNodesToNodeAgents(existingContainerHostnames); + nodeAdmin.refreshContainersToRun(nodeSpecs); assertTrue(nodeAdmin.isFrozen()); // Initially everything is frozen to force convergence mockNodeAgentSetFrozenResponse(nodeAgents, true, true, true); @@ -155,10 +153,28 @@ public class NodeAdminImplTest { assertEquals(Duration.ofSeconds(1), nodeAdmin.subsystemFreezeDuration()); } - private void mockNodeAgentSetFrozenResponse(List<NodeAgent> nodeAgents, boolean... responses) { + private void mockNodeAgentSetFrozenResponse(List<NodeAgentWithScheduler> nodeAgents, boolean... responses) { for (int i = 0; i < nodeAgents.size(); i++) { - NodeAgent nodeAgent = nodeAgents.get(i); - when(nodeAgent.setFrozen(anyBoolean())).thenReturn(responses[i]); + NodeAgentWithScheduler nodeAgent = nodeAgents.get(i); + when(nodeAgent.setFrozen(anyBoolean(), any())).thenReturn(responses[i]); } } + + private NodeSpec createNodeSpec(String hostname) { + return new NodeSpec.Builder() + .hostname(hostname) + .state(Node.State.active) + .nodeType(NodeType.tenant) + .flavor("default") + .build(); + } + + private NodeAgentWithScheduler mockNodeAgentWithSchedulerFactory(NodeSpec nodeSpec) { + NodeAgentContext context = new NodeAgentContextImpl.Builder(nodeSpec).build(); + when(nodeAgentContextFactory.create(eq(nodeSpec))).thenReturn(context); + + NodeAgentWithScheduler nodeAgentWithScheduler = mock(NodeAgentWithScheduler.class); + when(nodeAgentWithSchedulerFactory.create(eq(context))).thenReturn(nodeAgentWithScheduler); + return nodeAgentWithScheduler; + } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextManagerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextManagerTest.java new file mode 100644 index 00000000000..f32e3d91e34 --- /dev/null +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentContextManagerTest.java @@ -0,0 +1,142 @@ +// Copyright 2019 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +import org.junit.Test; + +import java.time.Clock; +import java.time.Duration; +import java.util.Optional; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; + +/** + * @author freva + */ +public class NodeAgentContextManagerTest { + + private static final int TIMEOUT = 10_000; + + private final Clock clock = Clock.systemUTC(); + private final NodeAgentContext initialContext = generateContext(); + private final NodeAgentContextManager manager = new NodeAgentContextManager(clock, initialContext); + + @Test(timeout = TIMEOUT) + public void returns_immediately_if_next_context_is_ready() throws InterruptedException { + NodeAgentContext context1 = generateContext(); + manager.scheduleTickWith(context1); + + assertSame(initialContext, manager.currentContext()); + assertSame(context1, manager.nextContext()); + assertSame(context1, manager.currentContext()); + } + + @Test(timeout = TIMEOUT) + public void blocks_in_nextContext_until_one_is_scheduled() throws InterruptedException { + AsyncExecutor<NodeAgentContext> async = new AsyncExecutor<>(manager::nextContext); + assertFalse(async.response.isPresent()); + Thread.sleep(10); + assertFalse(async.response.isPresent()); + + NodeAgentContext context1 = generateContext(); + manager.scheduleTickWith(context1); + + async.awaitResult(); + assertEquals(Optional.of(context1), async.response); + assertFalse(async.exception.isPresent()); + } + + @Test(timeout = TIMEOUT) + public void blocks_in_nextContext_until_interrupt() throws InterruptedException { + AsyncExecutor<NodeAgentContext> async = new AsyncExecutor<>(manager::nextContext); + assertFalse(async.response.isPresent()); + Thread.sleep(10); + assertFalse(async.response.isPresent()); + + manager.interrupt(); + + async.awaitResult(); + assertEquals(Optional.of(InterruptedException.class), async.exception.map(Exception::getClass)); + assertFalse(async.response.isPresent()); + } + + @Test(timeout = TIMEOUT) + public void setFrozen_does_not_block_with_no_timeout() throws InterruptedException { + assertFalse(manager.setFrozen(false, Duration.ZERO)); + + // Generate new context and get it from the supplier, this completes the unfreeze + NodeAgentContext context1 = generateContext(); + manager.scheduleTickWith(context1); + assertSame(context1, manager.nextContext()); + + assertTrue(manager.setFrozen(false, Duration.ZERO)); + } + + @Test(timeout = TIMEOUT) + public void setFrozen_blocks_at_least_for_duration_of_timeout() { + long wantedDurationMillis = 100; + long start = clock.millis(); + assertFalse(manager.setFrozen(false, Duration.ofMillis(wantedDurationMillis))); + long actualDurationMillis = clock.millis() - start; + + assertTrue(actualDurationMillis >= wantedDurationMillis); + } + + @Test(timeout = TIMEOUT) + public void setFrozen_is_successful_if_converged_in_time() throws InterruptedException { + AsyncExecutor<Boolean> async = new AsyncExecutor<>(() -> manager.setFrozen(false, Duration.ofMillis(500))); + + assertFalse(async.response.isPresent()); + + NodeAgentContext context1 = generateContext(); + manager.scheduleTickWith(context1); + assertSame(context1, manager.nextContext()); + + async.awaitResult(); + assertEquals(Optional.of(true), async.response); + assertFalse(async.exception.isPresent()); + } + + private static NodeAgentContext generateContext() { + return new NodeAgentContextImpl.Builder("container-123.domain.tld").build(); + } + + private class AsyncExecutor<T> { + private final Object monitor = new Object(); + private final Thread thread; + private volatile Optional<T> response = Optional.empty(); + private volatile Optional<Exception> exception = Optional.empty(); + private boolean completed = false; + + private AsyncExecutor(ThrowingSupplier<T> supplier) { + this.thread = new Thread(() -> { + try { + response = Optional.of(supplier.get()); + } catch (Exception e) { + exception = Optional.of(e); + } + synchronized (monitor) { + completed = true; + monitor.notifyAll(); + } + }); + this.thread.start(); + } + + private void awaitResult() { + synchronized (monitor) { + while (!completed) { + try { + monitor.wait(); + } catch (InterruptedException ignored) { } + } + } + } + } + + private interface ThrowingSupplier<T> { + T get() throws Exception; + } +}
\ No newline at end of file diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index b6128fc8693..e392ac34414 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -4,8 +4,8 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.config.provision.NodeType; import com.yahoo.metrics.simple.MetricReceiver; -import com.yahoo.test.ManualClock; import com.yahoo.vespa.hosted.dockerapi.Container; +import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.ContainerResources; import com.yahoo.vespa.hosted.dockerapi.ContainerStats; import com.yahoo.vespa.hosted.dockerapi.exception.DockerException; @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeAttributes; +import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; import com.yahoo.vespa.hosted.node.admin.docker.DockerOperations; import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.maintenance.acl.AclMaintainer; @@ -28,7 +29,6 @@ import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.time.Duration; import java.util.Collections; import java.util.Map; import java.util.Optional; @@ -36,8 +36,6 @@ import java.util.Set; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -56,14 +54,21 @@ import static org.mockito.Mockito.when; * @author Øyvind Bakksjø */ public class NodeAgentImplTest { - private static final Duration NODE_AGENT_SCAN_INTERVAL = Duration.ofSeconds(30); private static final double MIN_CPU_CORES = 2; private static final double MIN_MAIN_MEMORY_AVAILABLE_GB = 16; private static final double MIN_DISK_AVAILABLE_GB = 250; private static final String vespaVersion = "1.2.3"; private final String hostName = "host1.test.yahoo.com"; - private final NodeAgentContext context = new NodeAgentContextImpl.Builder(hostName).build(); + private final NodeSpec.Builder nodeBuilder = new NodeSpec.Builder() + .hostname(hostName) + .nodeType(NodeType.tenant) + .flavor("docker") + .minCpuCores(MIN_CPU_CORES) + .minMainMemoryAvailableGb(MIN_MAIN_MEMORY_AVAILABLE_GB) + .minDiskAvailableGb(MIN_DISK_AVAILABLE_GB); + + private final NodeAgentContextSupplier contextSupplier = mock(NodeAgentContextSupplier.class); private final DockerImage dockerImage = new DockerImage("dockerImage"); private final DockerOperations dockerOperations = mock(DockerOperations.class); private final NodeRepository nodeRepository = mock(NodeRepository.class); @@ -76,16 +81,6 @@ public class NodeAgentImplTest { Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap()); private final AthenzCredentialsMaintainer athenzCredentialsMaintainer = mock(AthenzCredentialsMaintainer.class); - private final ManualClock clock = new ManualClock(); - - private final NodeSpec.Builder nodeBuilder = new NodeSpec.Builder() - .hostname(context.hostname().value()) - .nodeType(NodeType.tenant) - .flavor("docker") - .minCpuCores(MIN_CPU_CORES) - .minMainMemoryAvailableGb(MIN_MAIN_MEMORY_AVAILABLE_GB) - .minDiskAvailableGb(MIN_DISK_AVAILABLE_GB); - @Test public void upToDateContainerIsUntouched() { @@ -97,11 +92,12 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(187500000000L)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(dockerOperations, never()).removeContainer(eq(context), any()); verify(orchestrator, never()).suspend(any(String.class)); @@ -125,11 +121,12 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(217432719360L)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(context)); } @@ -145,27 +142,28 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(187500000000L)); - nodeAgent.converge(); + nodeAgent.doConverge(context); inOrder.verify(dockerOperations, never()).startServices(eq(context)); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); nodeAgent.suspend(); - nodeAgent.converge(); + nodeAgent.doConverge(context); inOrder.verify(dockerOperations, never()).startServices(eq(context)); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); // Expect a resume, but no start services // No new suspends/stops, so no need to resume/start - nodeAgent.converge(); + nodeAgent.doConverge(context); inOrder.verify(dockerOperations, never()).startServices(eq(context)); inOrder.verify(dockerOperations, never()).resumeNode(eq(context)); nodeAgent.suspend(); nodeAgent.stopServices(); - nodeAgent.converge(); + nodeAgent.doConverge(context); inOrder.verify(dockerOperations, times(1)).startServices(eq(context)); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); } @@ -181,13 +179,14 @@ public class NodeAgentImplTest { .currentRestartGeneration(restartGeneration.get()) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(null, false); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(dockerOperations, never()).removeContainer(eq(context), any()); verify(dockerOperations, never()).startServices(any()); @@ -195,7 +194,7 @@ public class NodeAgentImplTest { final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer, healthChecker); inOrder.verify(dockerOperations, times(1)).pullImageAsyncIfNeeded(eq(dockerImage)); - inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); + inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), any()); inOrder.verify(dockerOperations, times(1)).startContainer(eq(context)); inOrder.verify(aclMaintainer, times(1)).converge(); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); @@ -216,13 +215,14 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(orchestrator, never()).suspend(any(String.class)); verify(orchestrator, never()).resume(any(String.class)); @@ -241,29 +241,25 @@ public class NodeAgentImplTest { .wantedVespaVersion(vespaVersion) .vespaVersion(vespaVersion); + NodeAgentContext firstContext = createContext(specBuilder.build()); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); - NodeSpec firstSpec = specBuilder.build(); - NodeSpec secondSpec = specBuilder.minDiskAvailableGb(200).build(); - NodeSpec thirdSpec = specBuilder.minCpuCores(4).build(); - - when(nodeRepository.getOptionalNode(hostName)) - .thenReturn(Optional.of(firstSpec)) - .thenReturn(Optional.of(secondSpec)) - .thenReturn(Optional.of(thirdSpec)); + when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); + when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); - nodeAgent.converge(); - nodeAgent.converge(); - nodeAgent.converge(); + nodeAgent.doConverge(firstContext); + NodeAgentContext secondContext = createContext(specBuilder.minDiskAvailableGb(200).build()); + nodeAgent.doConverge(secondContext); + NodeAgentContext thirdContext = createContext(specBuilder.minCpuCores(4).build()); + nodeAgent.doConverge(thirdContext); InOrder inOrder = inOrder(orchestrator, dockerOperations); inOrder.verify(orchestrator).resume(any(String.class)); inOrder.verify(orchestrator).resume(any(String.class)); inOrder.verify(orchestrator).suspend(any(String.class)); - inOrder.verify(dockerOperations).removeContainer(eq(context), any()); - inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), eq(thirdSpec), any()); - inOrder.verify(dockerOperations).startContainer(eq(context)); + inOrder.verify(dockerOperations).removeContainer(eq(thirdContext), any()); + inOrder.verify(dockerOperations, times(1)).createContainer(eq(thirdContext), any()); + inOrder.verify(dockerOperations).startContainer(eq(thirdContext)); inOrder.verify(orchestrator).resume(any(String.class)); } @@ -281,14 +277,16 @@ public class NodeAgentImplTest { .currentRestartGeneration(currentRestartGeneration) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); + doThrow(new OrchestratorException("Denied")).when(orchestrator).suspend(eq(hostName)); try { - nodeAgent.converge(); + nodeAgent.doConverge(context); fail("Expected to throw an exception"); - } catch (Exception ignored) { } + } catch (OrchestratorException ignored) { } - verify(dockerOperations, never()).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, never()).createContainer(eq(context), any()); verify(dockerOperations, never()).startContainer(eq(context)); verify(orchestrator, never()).resume(any(String.class)); verify(nodeRepository, never()).updateNodeAttributes(any(String.class), any(NodeAttributes.class)); @@ -308,6 +306,7 @@ public class NodeAgentImplTest { .currentRebootGeneration(currentRebootGeneration) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); @@ -317,22 +316,22 @@ public class NodeAgentImplTest { .when(healthChecker).verifyHealth(eq(context)); try { - nodeAgent.converge(); + nodeAgent.doConverge(context); } catch (ConvergenceException ignored) {} // First time we fail to resume because health verification fails verify(orchestrator, times(1)).suspend(eq(hostName)); verify(dockerOperations, times(1)).removeContainer(eq(context), any()); - verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, times(1)).createContainer(eq(context), any()); verify(dockerOperations, times(1)).startContainer(eq(context)); verify(orchestrator, never()).resume(eq(hostName)); verify(nodeRepository, never()).updateNodeAttributes(any(), any()); - nodeAgent.converge(); + nodeAgent.doConverge(context); // Do not reboot the container again verify(dockerOperations, times(1)).removeContainer(eq(context), any()); - verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, times(1)).createContainer(eq(context), any()); verify(orchestrator, times(1)).resume(eq(hostName)); verify(nodeRepository, times(1)).updateNodeAttributes(eq(hostName), eq(new NodeAttributes() .withRebootGeneration(wantedRebootGeneration))); @@ -348,11 +347,12 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(dockerOperations, never()).removeContainer(eq(context), any()); verify(orchestrator, never()).resume(any(String.class)); @@ -365,18 +365,19 @@ public class NodeAgentImplTest { .state(Node.State.ready) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(null,false); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - nodeAgent.converge(); - nodeAgent.converge(); - nodeAgent.converge(); + nodeAgent.doConverge(context); + nodeAgent.doConverge(context); + nodeAgent.doConverge(context); // Should only be called once, when we initialize verify(dockerOperations, times(1)).getContainer(eq(context)); verify(dockerOperations, never()).removeContainer(eq(context), any()); - verify(dockerOperations, never()).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, never()).createContainer(eq(context), any()); verify(dockerOperations, never()).startContainer(eq(context)); verify(orchestrator, never()).resume(any(String.class)); verify(nodeRepository, never()).updateNodeAttributes(eq(hostName), any()); @@ -392,11 +393,12 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - nodeAgent.converge(); + nodeAgent.doConverge(context); final InOrder inOrder = inOrder(storageMaintainer, dockerOperations); inOrder.verify(dockerOperations, never()).removeContainer(eq(context), any()); @@ -413,11 +415,12 @@ public class NodeAgentImplTest { .wantedVespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(null, false); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(nodeRepository, never()).updateNodeAttributes(eq(hostName), any()); } @@ -433,20 +436,21 @@ public class NodeAgentImplTest { .state(nodeState) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - nodeAgent.converge(); + nodeAgent.doConverge(context); final InOrder inOrder = inOrder(storageMaintainer, dockerOperations, nodeRepository); inOrder.verify(dockerOperations, times(1)).stopServices(eq(context)); - inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), eq(node), any()); + inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any()); inOrder.verify(dockerOperations, times(1)).removeContainer(eq(context), any()); inOrder.verify(storageMaintainer, times(1)).archiveNodeStorage(eq(context)); inOrder.verify(nodeRepository, times(1)).setNodeState(eq(hostName), eq(Node.State.ready)); - verify(dockerOperations, never()).createContainer(eq(context), any(), any()); + verify(dockerOperations, never()).createContainer(eq(context), any()); verify(dockerOperations, never()).startContainer(eq(context)); verify(dockerOperations, never()).suspendNode(eq(context)); verify(dockerOperations, times(1)).stopServices(eq(context)); @@ -474,10 +478,11 @@ public class NodeAgentImplTest { .state(Node.State.provisioned) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(null, false); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(nodeRepository, times(1)).setNodeState(eq(hostName), eq(Node.State.dirty)); } @@ -490,15 +495,16 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, false); when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node)); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); - nodeAgent.tick(); + nodeAgent.doConverge(context); verify(dockerOperations, times(1)).removeContainer(eq(context), any()); - verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, times(1)).createContainer(eq(context), any()); verify(dockerOperations, times(1)).startContainer(eq(context)); } @@ -511,6 +517,7 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node)); @@ -523,7 +530,7 @@ public class NodeAgentImplTest { // 1st try try { - nodeAgent.converge(); + nodeAgent.doConverge(context); fail("Expected to throw an exception"); } catch (RuntimeException ignored) { } @@ -531,7 +538,7 @@ public class NodeAgentImplTest { inOrder.verifyNoMoreInteractions(); // 2nd try - nodeAgent.converge(); + nodeAgent.doConverge(context); inOrder.verify(dockerOperations).resumeNode(any()); inOrder.verify(orchestrator).resume(hostName); @@ -539,33 +546,6 @@ public class NodeAgentImplTest { } @Test - public void testSetFrozen() { - NodeAgentImpl nodeAgent = spy(makeNodeAgent(dockerImage, true)); - doNothing().when(nodeAgent).converge(); - - nodeAgent.tick(); - verify(nodeAgent, times(1)).converge(); - - assertFalse(nodeAgent.setFrozen(true)); // Returns true because we are not frozen until tick is called - nodeAgent.tick(); - verify(nodeAgent, times(1)).converge(); // Frozen should be set, therefore converge is never called - - assertTrue(nodeAgent.setFrozen(true)); // Attempt to set frozen again, but it's already set - clock.advance(Duration.ofSeconds(35)); // workToDoNow is no longer set, so we need to wait the regular time - nodeAgent.tick(); - verify(nodeAgent, times(1)).converge(); - - assertFalse(nodeAgent.setFrozen(false)); // Unfreeze, but still need to call tick for it to take effect - nodeAgent.tick(); - verify(nodeAgent, times(2)).converge(); - - assertTrue(nodeAgent.setFrozen(false)); - clock.advance(Duration.ofSeconds(35)); // workToDoNow is no longer set, so we need to wait the regular time - nodeAgent.tick(); - verify(nodeAgent, times(3)).converge(); - } - - @Test public void start_container_subtask_failure_leads_to_container_restart() { final NodeSpec node = nodeBuilder .wantedDockerImage(dockerImage) @@ -573,30 +553,30 @@ public class NodeAgentImplTest { .wantedVespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false)); - when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(context)); try { - nodeAgent.converge(); + nodeAgent.doConverge(context); fail("Expected to get DockerException"); } catch (DockerException ignored) { } verify(dockerOperations, never()).removeContainer(eq(context), any()); - verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, times(1)).createContainer(eq(context), any()); verify(dockerOperations, times(1)).startContainer(eq(context)); verify(nodeAgent, never()).resumeNodeIfNeeded(any()); // The docker container was actually started and is running, but subsequent exec calls to set up // networking failed mockGetContainer(dockerImage, true); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(dockerOperations, times(1)).removeContainer(eq(context), any()); - verify(dockerOperations, times(2)).createContainer(eq(context), eq(node), any()); + verify(dockerOperations, times(2)).createContainer(eq(context), any()); verify(dockerOperations, times(2)).startContainer(eq(context)); verify(nodeAgent, times(1)).resumeNodeIfNeeded(any()); } @@ -631,6 +611,7 @@ public class NodeAgentImplTest { .parentHostname("parent.host.name.yahoo.com") .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node)); @@ -638,12 +619,9 @@ public class NodeAgentImplTest { when(dockerOperations.getContainerStats(eq(context))) .thenReturn(Optional.of(stats1)) .thenReturn(Optional.of(stats2)); - - nodeAgent.converge(); // Run the converge loop once to initialize lastNode + nodeAgent.updateContainerNodeMetrics(); // Update metrics once to init and lastCpuMetric - clock.advance(Duration.ofSeconds(1234)); - Path pathToExpectedMetrics = Paths.get(classLoader.getResource("expected.container.system.metrics.txt").getPath()); String expectedMetrics = new String(Files.readAllBytes(pathToExpectedMetrics)) .replaceAll("\\s", "") @@ -674,13 +652,11 @@ public class NodeAgentImplTest { .state(Node.State.ready) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(null, false); - when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node)); when(dockerOperations.getContainerStats(eq(context))).thenReturn(Optional.empty()); - nodeAgent.converge(); // Run the converge loop once to initialize lastNode - nodeAgent.updateContainerNodeMetrics(); Set<Map<String, Object>> actualMetrics = metricReceiver.getAllMetricsRaw(); @@ -696,20 +672,21 @@ public class NodeAgentImplTest { .wantedVespaVersion(vespaVersion) .build(); + NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(null, false); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); - nodeAgent.converge(); + nodeAgent.doConverge(context); verify(dockerOperations, never()).removeContainer(eq(context), any()); verify(orchestrator, never()).suspend(any(String.class)); final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer); inOrder.verify(dockerOperations, times(1)).pullImageAsyncIfNeeded(eq(dockerImage)); - inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); + inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), any()); inOrder.verify(dockerOperations, times(1)).startContainer(eq(context)); inOrder.verify(aclMaintainer, times(1)).converge(); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); @@ -722,24 +699,33 @@ public class NodeAgentImplTest { mockGetContainer(dockerImage, isRunning); when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats)); - doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); + doNothing().when(storageMaintainer).writeMetricsConfig(any()); - return new NodeAgentImpl(context, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer), + return new NodeAgentImpl(contextSupplier, nodeRepository, orchestrator, dockerOperations, + storageMaintainer, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer), Optional.of(healthChecker)); } private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { - Optional<Container> container = dockerImage != null ? - Optional.of(new Container( - hostName, - dockerImage, - ContainerResources.from(MIN_CPU_CORES, MIN_MAIN_MEMORY_AVAILABLE_GB), - context.containerName(), - isRunning ? Container.State.RUNNING : Container.State.EXITED, - isRunning ? 1 : 0)) : - Optional.empty(); - - when(dockerOperations.getContainer(eq(context))).thenReturn(container); + doAnswer(invoc -> { + NodeAgentContext context = invoc.getArgument(0); + if (!hostName.equals(context.hostname().value())) + throw new IllegalArgumentException(); + return dockerImage != null ? + Optional.of(new Container( + hostName, + dockerImage, + ContainerResources.from(MIN_CPU_CORES, MIN_MAIN_MEMORY_AVAILABLE_GB), + ContainerName.fromHostname(hostName), + isRunning ? Container.State.RUNNING : Container.State.EXITED, + isRunning ? 1 : 0)) : + Optional.empty(); + }).when(dockerOperations).getContainer(any()); + } + + private NodeAgentContext createContext(NodeSpec nodeSpec) { + NodeAgentContext context = new NodeAgentContextImpl.Builder(nodeSpec).build(); + when(contextSupplier.currentContext()).thenReturn(context); + return context; } } |