diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-10-13 17:28:27 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-13 17:28:27 +0200 |
commit | 6e7459c9eff635d9c8227cd8d1add320f298c0e2 (patch) | |
tree | c309739b208e25daeb3efc907ea311d9fccf9a5e | |
parent | 92014fad54908e405ceaa254631b5af2d07806cf (diff) | |
parent | e3676a710abe26a3293ce5a9d676d62b3b388bac (diff) |
Merge pull request #24434 from vespa-engine/freva/allocation-fixes
Remove HostLivenessTracker
17 files changed, 73 insertions, 396 deletions
diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/HostLivenessTracker.java b/config-provisioning/src/main/java/com/yahoo/config/provision/HostLivenessTracker.java deleted file mode 100644 index f3326ec2bc8..00000000000 --- a/config-provisioning/src/main/java/com/yahoo/config/provision/HostLivenessTracker.java +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.config.provision; - -import java.time.Instant; -import java.util.Optional; - -/** - * Instances of this are used to keep track of (notify and query) - * which hosts are currently connected to the config system. - * - * @author bratseth - */ -public interface HostLivenessTracker { - - /** Called each time a config request is received from a client */ - void receivedRequestFrom(String hostname); - - /** - * Returns the epoch timestamp of the last request received from the given hostname, - * or empty if there is no memory of this host making a request - */ - Optional<Instant> lastRequestFrom(String hostname); - -} diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/host/ConfigRequestHostLivenessTracker.java b/configserver/src/main/java/com/yahoo/vespa/config/server/host/ConfigRequestHostLivenessTracker.java deleted file mode 100644 index 9195f78b23b..00000000000 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/host/ConfigRequestHostLivenessTracker.java +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.config.server.host; - -import com.yahoo.component.annotation.Inject; -import com.yahoo.config.provision.HostLivenessTracker; - -import java.time.Clock; -import java.time.Instant; -import java.util.Map; -import java.util.Optional; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Keeps track of the last config request made by each hostname. - * This always remembers all requests forever since the moment is is constructed. - * This is the implementation which will be injected to components who request a HostLivenessTracker. - * - * @author bratseth - */ -public class ConfigRequestHostLivenessTracker implements HostLivenessTracker { - - private final Clock clock; - private final Map<String, Instant> lastRequestFromHost = new ConcurrentHashMap<>(); - - @Inject - @SuppressWarnings("unused") - public ConfigRequestHostLivenessTracker() { - this(Clock.systemUTC()); - } - - public ConfigRequestHostLivenessTracker(Clock clock) { - this.clock = clock; - } - - @Override - public void receivedRequestFrom(String hostname) { - lastRequestFromHost.put(hostname, clock.instant()); - } - - @Override - public Optional<Instant> lastRequestFrom(String hostname) { - return Optional.ofNullable(lastRequestFromHost.get(hostname)); - } - -} diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/GetConfigProcessor.java b/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/GetConfigProcessor.java index b7327ef3aa7..1c419ce047a 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/GetConfigProcessor.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/GetConfigProcessor.java @@ -20,6 +20,7 @@ import com.yahoo.vespa.config.protocol.VespaVersion; import com.yahoo.vespa.config.server.GetConfigContext; import com.yahoo.vespa.config.server.UnknownConfigDefinitionException; import com.yahoo.vespa.config.server.tenant.TenantRepository; + import java.util.Optional; import java.util.Set; import java.util.logging.Level; @@ -150,7 +151,6 @@ class GetConfigProcessor implements Runnable { @Override public void run() { - rpcServer.hostLivenessTracker().receivedRequestFrom(request.getClientHostName()); Pair<GetConfigContext, Long> delayed = getConfig(request); if (delayed != null) { diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/RpcServer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/RpcServer.java index b36967d76a4..a2461706f11 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/RpcServer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/rpc/RpcServer.java @@ -7,7 +7,6 @@ import com.yahoo.component.annotation.Inject; import com.yahoo.concurrent.ThreadFactoryFactory; import com.yahoo.config.FileReference; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.TenantName; import com.yahoo.jrt.Acceptor; import com.yahoo.jrt.DataValue; @@ -27,8 +26,8 @@ import com.yahoo.vespa.config.protocol.ConfigResponse; import com.yahoo.vespa.config.protocol.JRTServerConfigRequest; import com.yahoo.vespa.config.protocol.JRTServerConfigRequestV3; import com.yahoo.vespa.config.protocol.Trace; -import com.yahoo.vespa.config.server.GetConfigContext; import com.yahoo.vespa.config.server.ConfigActivationListener; +import com.yahoo.vespa.config.server.GetConfigContext; import com.yahoo.vespa.config.server.RequestHandler; import com.yahoo.vespa.config.server.SuperModelRequestHandler; import com.yahoo.vespa.config.server.application.ApplicationSet; @@ -44,6 +43,7 @@ import com.yahoo.vespa.filedistribution.FileDownloader; import com.yahoo.vespa.filedistribution.FileReceiver; import com.yahoo.vespa.filedistribution.FileReferenceData; import com.yahoo.vespa.filedistribution.FileReferenceDownload; + import java.nio.ByteBuffer; import java.time.Duration; import java.util.Arrays; @@ -102,7 +102,6 @@ public class RpcServer implements Runnable, ConfigActivationListener, TenantList private final SuperModelRequestHandler superModelRequestHandler; private final MetricUpdater metrics; private final MetricUpdaterFactory metricUpdaterFactory; - private final HostLivenessTracker hostLivenessTracker; private final FileServer fileServer; private final RpcAuthorizer rpcAuthorizer; @@ -128,13 +127,12 @@ public class RpcServer implements Runnable, ConfigActivationListener, TenantList @Inject public RpcServer(ConfigserverConfig config, SuperModelRequestHandler superModelRequestHandler, MetricUpdaterFactory metrics, HostRegistry hostRegistry, - HostLivenessTracker hostLivenessTracker, FileServer fileServer, RpcAuthorizer rpcAuthorizer, + FileServer fileServer, RpcAuthorizer rpcAuthorizer, RpcRequestHandlerProvider handlerProvider) { this.superModelRequestHandler = superModelRequestHandler; metricUpdaterFactory = metrics; supervisor.setMaxOutputBufferSize(config.maxoutputbuffersize()); this.metrics = metrics.getOrCreateMetricUpdater(Collections.emptyMap()); - this.hostLivenessTracker = hostLivenessTracker; BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<>(config.maxgetconfigclients()); int rpcWorkerThreads = (config.numRpcThreads() == 0) ? threadsToUse() : config.numRpcThreads(); executorService = new ThreadPoolExecutor(rpcWorkerThreads, rpcWorkerThreads, @@ -613,8 +611,4 @@ public class RpcServer implements Runnable, ConfigActivationListener, TenantList req.returnValues().add(new Int32Value(0)); }); } - - HostLivenessTracker hostLivenessTracker() { - return hostLivenessTracker; - } } diff --git a/configserver/src/main/resources/configserver-app/services.xml b/configserver/src/main/resources/configserver-app/services.xml index 650176829e6..b8397722b3d 100644 --- a/configserver/src/main/resources/configserver-app/services.xml +++ b/configserver/src/main/resources/configserver-app/services.xml @@ -30,7 +30,6 @@ <component id="com.yahoo.vespa.config.server.host.HostRegistry" bundle="configserver" /> <component id="com.yahoo.vespa.config.server.ApplicationRepository" bundle="configserver" /> <component id="com.yahoo.vespa.config.server.version.VersionState" bundle="configserver" /> - <component id="com.yahoo.vespa.config.server.host.ConfigRequestHostLivenessTracker" bundle="configserver" /> <component id="com.yahoo.config.provision.Zone" bundle="config-provisioning" /> <component id="com.yahoo.vespa.config.server.application.ConfigConvergenceChecker" bundle="configserver" /> <component id="com.yahoo.vespa.config.server.application.HttpProxy" bundle="configserver" /> diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/MockRpcServer.java b/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/MockRpcServer.java index 3272689473e..0f9ce9eff13 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/MockRpcServer.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/MockRpcServer.java @@ -7,7 +7,6 @@ import com.yahoo.vespa.config.protocol.ConfigResponse; import com.yahoo.vespa.config.protocol.JRTServerConfigRequest; import com.yahoo.vespa.config.server.GetConfigContext; import com.yahoo.vespa.config.server.filedistribution.FileServer; -import com.yahoo.vespa.config.server.host.ConfigRequestHostLivenessTracker; import com.yahoo.vespa.config.server.host.HostRegistry; import com.yahoo.vespa.config.server.monitoring.Metrics; import com.yahoo.vespa.config.server.rpc.security.NoopRpcAuthorizer; @@ -38,7 +37,6 @@ public class MockRpcServer extends RpcServer { null, Metrics.createTestMetrics(), new HostRegistry(), - new ConfigRequestHostLivenessTracker(), new FileServer(tempDir), new NoopRpcAuthorizer(), new RpcRequestHandlerProvider()); diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/RpcTester.java b/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/RpcTester.java index 441f6c3a6ce..e5ed4e4673d 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/RpcTester.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/rpc/RpcTester.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.config.server.rpc; import com.yahoo.cloud.config.ConfigserverConfig; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.TenantName; import com.yahoo.config.provision.Zone; import com.yahoo.jrt.Request; @@ -21,7 +20,6 @@ import com.yahoo.vespa.config.server.SuperModelRequestHandler; import com.yahoo.vespa.config.server.TestConfigDefinitionRepo; import com.yahoo.vespa.config.server.application.OrchestratorMock; import com.yahoo.vespa.config.server.filedistribution.FileServer; -import com.yahoo.vespa.config.server.host.ConfigRequestHostLivenessTracker; import com.yahoo.vespa.config.server.host.HostRegistry; import com.yahoo.vespa.config.server.monitoring.Metrics; import com.yahoo.vespa.config.server.rpc.security.NoopRpcAuthorizer; @@ -51,7 +49,6 @@ public class RpcTester implements AutoCloseable { private final ManualClock clock = new ManualClock(Instant.ofEpochMilli(100)); private final String myHostname = HostName.getLocalhost(); - private final HostLivenessTracker hostLivenessTracker = new ConfigRequestHostLivenessTracker(clock); private final Spec spec; private final RpcServer rpcServer; @@ -95,7 +92,6 @@ public class RpcTester implements AutoCloseable { .withProvisioner(new MockProvisioner()) .withOrchestrator(new OrchestratorMock()) .build(); - assertFalse(hostLivenessTracker.lastRequestFrom(myHostname).isPresent()); } public void close() { @@ -122,7 +118,6 @@ public class RpcTester implements AutoCloseable { new InMemoryFlagSource())), Metrics.createTestMetrics(), hostRegistry, - hostLivenessTracker, new FileServer(temporaryFolder.newFolder()), new NoopRpcAuthorizer(), new RpcRequestHandlerProvider()); @@ -167,8 +162,6 @@ public class RpcTester implements AutoCloseable { void performRequest(Request req) { clock.advance(Duration.ofMillis(10)); sup.connect(spec).invokeSync(req, Duration.ofSeconds(10)); - if (req.methodName().equals(RpcServer.getConfigMethodName)) - assertEquals(clock.instant(), hostLivenessTracker.lastRequestFrom(myHostname).get()); } RpcServer rpcServer() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 3e7abe8f053..32eac49a288 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -77,23 +77,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { int throttledHostFailures = 0; int throttledNodeFailures = 0; - // Ready nodes - try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { - for (FailingNode failing : findReadyFailingNodes()) { - attempts++; - if (throttle(failing.node())) { - failures++; - if (failing.node().type().isHost()) - throttledHostFailures++; - else - throttledNodeFailures++; - continue; - } - nodeRepository().nodes().fail(failing.node().hostname(), Agent.NodeFailer, failing.reason()); - } - } - - // Active nodes for (FailingNode failing : findActiveFailingNodes()) { attempts++; if (!failAllowedFor(failing.node().type())) continue; @@ -116,22 +99,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { return asSuccessFactor(attempts, failures); } - private Collection<FailingNode> findReadyFailingNodes() { - Set<FailingNode> failingNodes = new HashSet<>(); - for (Node node : nodeRepository().nodes().list(Node.State.ready)) { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node); - List<String> failureReports = reasonsToFailHost(hostNode); - if (failureReports.size() > 0) { - if (hostNode.equals(node)) { - failingNodes.add(new FailingNode(node, "Host has failure reports: " + failureReports)); - } else { - failingNodes.add(new FailingNode(node, "Parent (" + hostNode + ") has failure reports: " + failureReports)); - } - } - } - return failingNodes; - } - private Collection<FailingNode> findActiveFailingNodes() { Set<FailingNode> failingNodes = new HashSet<>(); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); @@ -150,7 +117,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { for (Node node : activeNodes) { if (allSuspended(node, activeNodes)) { - Node host = node.parentHostname().flatMap(parent -> activeNodes.node(parent)).orElse(node); + Node host = node.parentHostname().flatMap(activeNodes::node).orElse(node); if (host.type().isHost()) { List<String> failureReports = reasonsToFailHost(host); if ( ! failureReports.isEmpty()) { @@ -175,7 +142,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { /** Returns whether node has any kind of hardware issue */ static boolean hasHardwareIssue(Node node, NodeList allNodes) { - Node host = node.parentHostname().flatMap(parent -> allNodes.node(parent)).orElse(node); + Node host = node.parentHostname().flatMap(allNodes::node).orElse(node); return reasonsToFailHost(host).size() > 0; } @@ -344,30 +311,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { } - private static class FailingNode { - - private final Node node; - private final String reason; - - public FailingNode(Node node, String reason) { - this.node = node; - this.reason = reason; - } - - public Node node() { return node; } - public String reason() { return reason; } - - @Override - public boolean equals(Object other) { - if ( ! (other instanceof FailingNode)) return false; - return ((FailingNode)other).node().equals(this.node()); - } - - @Override - public int hashCode() { - return node.hashCode(); - } - - } + private record FailingNode(Node node, String reason) { } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index b43e2ae051f..624492a14f3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationLockException; -import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.jdisc.Metric; import com.yahoo.lang.MutableInteger; import com.yahoo.transaction.Mutex; @@ -13,12 +12,10 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.service.monitor.ServiceMonitor; import com.yahoo.yolean.Exceptions; import java.time.Duration; -import java.time.Instant; import java.util.List; import java.util.Map; import java.util.Optional; @@ -34,41 +31,18 @@ import static java.util.stream.Collectors.counting; */ public class NodeHealthTracker extends NodeRepositoryMaintainer { - /** Provides information about the status of ready hosts */ - private final HostLivenessTracker hostLivenessTracker; - /** Provides (more accurate) information about the status of active hosts */ private final ServiceMonitor serviceMonitor; - public NodeHealthTracker(HostLivenessTracker hostLivenessTracker, - ServiceMonitor serviceMonitor, NodeRepository nodeRepository, + public NodeHealthTracker(ServiceMonitor serviceMonitor, NodeRepository nodeRepository, Duration interval, Metric metric) { super(nodeRepository, interval, metric); - this.hostLivenessTracker = hostLivenessTracker; this.serviceMonitor = serviceMonitor; } @Override protected double maintain() { - return ( updateReadyNodeLivenessEvents() + updateActiveNodeDownState() ) / 2; - } - - private double updateReadyNodeLivenessEvents() { - // Update node last request events through ZooKeeper to collect request to all config servers. - // We do this here ("lazily") to avoid writing to zk for each config request. - try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { - for (Node node : nodeRepository().nodes().list(Node.State.ready)) { - Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname()); - if (lastLocalRequest.isEmpty()) continue; - - if (!node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) { - History updatedHistory = node.history() - .with(new History.Event(History.Event.Type.requested, Agent.NodeHealthTracker, lastLocalRequest.get())); - nodeRepository().nodes().write(node.with(updatedHistory), lock); - } - } - } - return 1.0; + return updateActiveNodeDownState(); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index dac6ee61ef3..708d8b59eb0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -5,7 +5,6 @@ import com.yahoo.component.AbstractComponent; import com.yahoo.component.annotation.Inject; import com.yahoo.concurrent.maintenance.Maintainer; import com.yahoo.config.provision.Deployer; -import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.InfraDeployer; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.Zone; @@ -33,7 +32,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { @SuppressWarnings("unused") @Inject public NodeRepositoryMaintenance(NodeRepository nodeRepository, Deployer deployer, InfraDeployer infraDeployer, - HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, + ServiceMonitor serviceMonitor, Zone zone, Metric metric, ProvisionServiceProvider provisionServiceProvider, FlagSource flagSource, MetricsFetcher metricsFetcher) { @@ -46,7 +45,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { maintainers.add(infrastructureProvisioner); maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, defaults.throttlePolicy, metric)); - maintainers.add(new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric)); + maintainers.add(new NodeHealthTracker(serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric)); maintainers.add(new ExpeditedChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.expeditedChangeRedeployInterval)); maintainers.add(new ReservationExpirer(nodeRepository, defaults.reservationExpiry, metric)); maintainers.add(new RetiredExpirer(nodeRepository, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index 14fa2e1c8ff..4aca1cbd056 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -46,8 +46,10 @@ public class History { private static ImmutableMap<Event.Type, Event> toImmutableMap(Collection<Event> events) { ImmutableMap.Builder<Event.Type, Event> builder = new ImmutableMap.Builder<>(); - for (Event event : events) + for (Event event : events) { + if (event.type() == Event.Type.requested) continue; // TODO (freva): Remove requested event after 8.70 builder.put(event.type(), event); + } return builder.build(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index b4e304155a6..a5ebc6b3efc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.persistence; +import ai.vespa.http.DomainName; import com.yahoo.component.Version; import com.yahoo.concurrent.UncheckedTimeoutException; import com.yahoo.config.provision.ApplicationId; @@ -484,15 +485,12 @@ public class CuratorDatabaseClient { transaction.onCommitted(() -> { for (var lb : loadBalancers) { if (lb.state() == fromState) continue; + Optional<String> target = lb.instance().flatMap(instance -> instance.hostname().map(DomainName::value).or(instance::ipAddress)); if (fromState == null) { - log.log(Level.INFO, () -> "Creating " + lb.id() + lb.instance() - .map(instance -> " (" + instance.hostname() + ")") - .orElse("") + + log.log(Level.INFO, () -> "Creating " + lb.id() + target.map(t -> " (" + t + ")").orElse("") + " in " + lb.state()); } else { - log.log(Level.INFO, () -> "Moving " + lb.id() + lb.instance() - .map(instance -> " (" + instance.hostname() + ")") - .orElse("") + + log.log(Level.INFO, () -> "Moving " + lb.id() + target.map(t -> " (" + t + ")").orElse("") + " from " + fromState + " to " + lb.state()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java index 83a80847ec8..024f071abb1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java @@ -21,7 +21,6 @@ public class ContainerConfig { " <component id='com.yahoo.vespa.hosted.provision.testutils.MockDeployer'/>\n" + " <component id='com.yahoo.vespa.hosted.provision.testutils.MockInfraDeployer'/>\n" + " <component id='com.yahoo.vespa.hosted.provision.testutils.MockProvisioner'/>\n" + - " <component id='com.yahoo.vespa.hosted.provision.testutils.TestHostLivenessTracker'/>\n" + " <component id='com.yahoo.vespa.hosted.provision.testutils.ServiceMonitorStub'/>\n" + " <component id='com.yahoo.vespa.hosted.provision.testutils.MockDuperModel'/>\n" + " <component id='com.yahoo.vespa.hosted.provision.testutils.MockNodeFlavors'/>\n" + diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java deleted file mode 100644 index 28d0d5f89d7..00000000000 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.provision.testutils; - -import com.yahoo.config.provision.HostLivenessTracker; - -import java.time.Clock; -import java.time.Instant; -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; - -/** This is a fully functional implementation */ -public class TestHostLivenessTracker implements HostLivenessTracker { - - private final Clock clock; - private final Map<String, Instant> lastRequestFromHost = new HashMap<>(); - - public TestHostLivenessTracker(Clock clock) { - this.clock = clock; - } - - @Override - public void receivedRequestFrom(String hostname) { - lastRequestFromHost.put(hostname, clock.instant()); - } - - @Override - public Optional<Instant> lastRequestFrom(String hostname) { - return Optional.ofNullable(lastRequestFromHost.get(hostname)); - } - -} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java index f67e9cd8345..977d72c11ea 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java @@ -27,7 +27,6 @@ import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; import com.yahoo.vespa.hosted.provision.testutils.ServiceMonitorStub; -import com.yahoo.vespa.hosted.provision.testutils.TestHostLivenessTracker; import java.time.Clock; import java.time.Duration; @@ -63,7 +62,6 @@ public class NodeFailTester { public ServiceMonitorStub serviceMonitor; public MockDeployer deployer; public TestMetric metric; - private final TestHostLivenessTracker hostLivenessTracker; private final NodeRepositoryProvisioner provisioner; private final Curator curator; @@ -74,7 +72,6 @@ public class NodeFailTester { curator = tester.getCurator(); nodeRepository = tester.nodeRepository(); provisioner = tester.provisioner(); - hostLivenessTracker = new TestHostLivenessTracker(clock); } private void initializeMaintainers(Map<ApplicationId, MockDeployer.ApplicationContext> apps) { @@ -112,7 +109,7 @@ public class NodeFailTester { /** Create hostCount hosts, one app with containerCount containers, and one app with contentCount content nodes. */ public static NodeFailTester withTwoApplications(int hostCount, int containerCount, int contentCount) { NodeFailTester tester = new NodeFailTester(); - tester.createHostNodes(hostCount); + tester.tester.makeReadyHosts(hostCount, new NodeResources(2, 8, 20, 10)); // Create tenant host application ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build(); @@ -139,13 +136,7 @@ public class NodeFailTester { public static NodeFailTester withTwoApplications(int numberOfHosts) { NodeFailTester tester = new NodeFailTester(); - - int nodesPerHost = 3; - List<Node> hosts = tester.createHostNodes(numberOfHosts); - for (int i = 0; i < hosts.size(); i++) { - tester.createReadyNodes(nodesPerHost, i * nodesPerHost, Optional.of("parent" + (i + 1)), - new NodeResources(1, 4, 100, 0.3), NodeType.tenant); - } + tester.tester.makeReadyNodes(numberOfHosts, new NodeResources(4, 16, 400, 10), NodeType.host, 8); // Create applications ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build(); @@ -230,26 +221,11 @@ public class NodeFailTester { } public NodeHealthTracker createUpdater() { - return new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, Duration.ofMinutes(5), metric); - } - - public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) { - allNodesMakeAConfigRequestExcept(List.of(deadNodeArray)); - } - - public void allNodesMakeAConfigRequestExcept(List<Node> deadNodes) { - for (Node node : nodeRepository.nodes().list()) { - if ( ! deadNodes.contains(node)) - hostLivenessTracker.receivedRequestFrom(node.hostname()); - } + return new NodeHealthTracker(serviceMonitor, nodeRepository, Duration.ofMinutes(5), metric); } public Clock clock() { return clock; } - public List<Node> createReadyNodes(int count) { - return createReadyNodes(count, 0); - } - public List<Node> createReadyNodes(int count, NodeResources resources) { return createReadyNodes(count, 0, resources); } @@ -258,22 +234,10 @@ public class NodeFailTester { return createReadyNodes(count, 0, Optional.empty(), hostFlavors.getFlavorOrThrow("default"), nodeType); } - public List<Node> createReadyNodes(int count, int startIndex) { - return createReadyNodes(count, startIndex, "default"); - } - - public List<Node> createReadyNodes(int count, int startIndex, String flavor) { - return createReadyNodes(count, startIndex, Optional.empty(), hostFlavors.getFlavorOrThrow(flavor), NodeType.tenant); - } - public List<Node> createReadyNodes(int count, int startIndex, NodeResources resources) { return createReadyNodes(count, startIndex, Optional.empty(), new Flavor(resources), NodeType.tenant); } - private List<Node> createReadyNodes(int count, int startIndex, Optional<String> parentHostname, NodeResources resources, NodeType nodeType) { - return createReadyNodes(count, startIndex, parentHostname, new Flavor(resources), nodeType); - } - private List<Node> createReadyNodes(int count, int startIndex, Optional<String> parentHostname, Flavor flavor, NodeType nodeType) { List<Node> nodes = new ArrayList<>(count); int lastOctetOfPoolAddress = 0; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 3ba536ee4d7..a21edf31cb8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Report; -import com.yahoo.vespa.hosted.provision.node.Reports; import org.junit.Test; import java.time.Duration; @@ -21,6 +20,7 @@ import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Set; @@ -62,23 +62,21 @@ public class NodeFailerTest { } private void testNodeFailingWith(NodeFailTester tester, String hostWithHwFailure) { - // The host should have 2 nodes in active and 1 ready + // The host should have 2 nodes in active Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream() .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); assertEquals(2, hostnamesByState.get(Node.State.active).size()); - assertEquals(1, hostnamesByState.get(Node.State.ready).size()); // Suspend the first of the active nodes tester.suspend(hostnamesByState.get(Node.State.active).get(0)); tester.runMaintainers(); tester.clock.advance(Duration.ofHours(25)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); // The first (and the only) ready node and the 1st active node that was allowed to fail should be failed Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of( - Node.State.failed, List.of(hostnamesByState.get(Node.State.ready).get(0), hostnamesByState.get(Node.State.active).get(0)), + Node.State.failed, List.of(hostnamesByState.get(Node.State.active).get(0)), Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2)); Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream() .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); @@ -88,7 +86,6 @@ public class NodeFailerTest { tester.suspend(hostnamesByState.get(Node.State.active).get(1)); tester.clock.advance(Duration.ofHours(25)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); // All of the children should be failed now @@ -101,7 +98,7 @@ public class NodeFailerTest { tester.suspend(hostWithHwFailure); tester.runMaintainers(); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(hostWithHwFailure).get().state()); - assertEquals(4, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).size()); } @Test @@ -110,14 +107,12 @@ public class NodeFailerTest { String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state()); - // The host has 2 nodes in active and 1 ready + // The host has 2 nodes in active Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.nodes().list().childrenOf(hostWithFailureReports).asList().stream() .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); assertEquals(2, hostnamesByState.get(Node.State.active).size()); String activeChild1 = hostnamesByState.get(Node.State.active).get(0); String activeChild2 = hostnamesByState.get(Node.State.active).get(1); - assertEquals(1, hostnamesByState.get(Node.State.ready).size()); - String readyChild = hostnamesByState.get(Node.State.ready).get(0); // Set failure report to the parent and all its children. Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low"); @@ -128,20 +123,16 @@ public class NodeFailerTest { tester.nodeRepository.nodes().write(updatedNode, () -> {}); }); - // The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended - tester.allNodesMakeAConfigRequestExcept(); + // Neither the host nor the 2 active nodes are failed out because they have not been suspended tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild1).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild2).get().state()); - // Suspending the host will not fail any more since none of the children are suspened + // Suspending the host will not fail any more since none of the children are suspended tester.suspend(hostWithFailureReports); tester.clock.advance(Duration.ofHours(25)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild1).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild2).get().state()); @@ -149,9 +140,7 @@ public class NodeFailerTest { // Suspending one child node will fail that out. tester.suspend(activeChild1); tester.clock.advance(Duration.ofHours(25)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state()); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(activeChild1).get().state()); assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild2).get().state()); @@ -159,9 +148,7 @@ public class NodeFailerTest { // Suspending the second child node will fail that out and the host. tester.suspend(activeChild2); tester.clock.advance(Duration.ofHours(25)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state()); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state()); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(activeChild1).get().state()); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(activeChild2).get().state()); @@ -210,31 +197,18 @@ public class NodeFailerTest { @Test public void node_failing() { - NodeFailTester tester = NodeFailTester.withTwoApplications(); + NodeFailTester tester = NodeFailTester.withTwoApplications(6); // For a day all nodes work so nothing happens for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals( 0, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(0, tester.deployer.redeployments); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); + assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); } - // Hardware failures are detected on two ready nodes, which are then failed - Node readyFail1 = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).asList().get(2); - Node readyFail2 = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).asList().get(3); - tester.nodeRepository.nodes().write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); - tester.nodeRepository.nodes().write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); - assertEquals(4, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - tester.runMaintainers(); - assertEquals(2, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyFail1.hostname()).get().state()); - assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyFail2.hostname()).get().state()); - String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname(); String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname(); tester.serviceMonitor.setHostDown(downHost1); @@ -243,41 +217,34 @@ public class NodeFailerTest { for (int minutes = 0; minutes < 45; minutes +=5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals( 0, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals( 2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals( 2, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(0, tester.deployer.redeployments); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); + assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); } tester.serviceMonitor.setHostUp(downHost1); // downHost2 should now be failed and replaced, but not downHost1 tester.clock.advance(Duration.ofDays(1)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals( 1, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals( 1, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(1, tester.deployer.redeployments); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); + assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(downHost2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).asList().get(0).hostname()); // downHost1 fails again tester.serviceMonitor.setHostDown(downHost1); tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); // the system goes down tester.clock.advance(Duration.ofMinutes(120)); tester.failer = tester.createFailer(); tester.runMaintainers(); // the host is still down and fails tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals( 2, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(2, tester.deployer.redeployments); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); + assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); // the last host goes down Node lastNode = tester.highestIndex(tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1)); @@ -286,23 +253,19 @@ public class NodeFailerTest { for (int minutes = 0; minutes < 75; minutes +=5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals( 2, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(2, tester.deployer.redeployments); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); + assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); } // A new node is available tester.createReadyNodes(1, 16, NodeFailTester.nodeResources); tester.clock.advance(Duration.ofDays(1)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); // The node is now failed - assertEquals( 3, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals( 5, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(3, tester.deployer.redeployments); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); + assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertTrue("The index of the last failed node is not reused", tester.highestIndex(tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1)).allocation().get().membership().index() > @@ -319,12 +282,10 @@ public class NodeFailerTest { String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname(); tester.serviceMonitor.setHostDown(downNode); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); tester.clock.advance(Duration.ofMinutes(75)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state()); @@ -332,12 +293,10 @@ public class NodeFailerTest { // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again tester.nodeRepository.nodes().reactivate(downNode, Agent.system, getClass().getSimpleName()); tester.clock.advance(Duration.ofMinutes(30)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); tester.clock.advance(Duration.ofMinutes(45)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state()); @@ -360,7 +319,6 @@ public class NodeFailerTest { for (int minutes = 0; minutes < 45; minutes +=5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); assertEquals(0, tester.deployer.redeployments); assertEquals(3, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); @@ -368,7 +326,6 @@ public class NodeFailerTest { // downHost should now be failed and replaced tester.clock.advance(Duration.ofDays(1)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(1, tester.deployer.redeployments); assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); @@ -403,37 +360,15 @@ public class NodeFailerTest { } @Test - public void host_not_failed_without_config_requests() { - NodeFailTester tester = NodeFailTester.withTwoApplications(); - - // For a day all nodes work so nothing happens - for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(); - tester.runMaintainers(); - assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host).size()); - assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); - } - - tester.clock.advance(Duration.ofMinutes(180)); - Node host = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host).first().get(); - tester.allNodesMakeAConfigRequestExcept(host); - tester.runMaintainers(); - assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host).size()); - assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); - } - - @Test public void failing_hosts() { NodeFailTester tester = NodeFailTester.withTwoApplications(7); // For a day all nodes work so nothing happens for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(13, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); + assertEquals(0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); } @@ -446,21 +381,17 @@ public class NodeFailerTest { for (int minutes = 0; minutes < 45; minutes += 5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); assertEquals(0, tester.deployer.redeployments); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(13, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); } tester.clock.advance(Duration.ofMinutes(30)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(2, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); @@ -468,9 +399,8 @@ public class NodeFailerTest { tester.runMaintainers(); assertEquals(2 + 1, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(6, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); @@ -483,18 +413,15 @@ public class NodeFailerTest { for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals(3 + 1, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(2 + 1, tester.nodeRepository.nodes().list(Node.State.failed).size()); } tester.clock.advance(Duration.ofMinutes(30)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(3 + 1, tester.deployer.redeployments); - assertEquals(4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(9, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(6, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); @@ -503,14 +430,12 @@ public class NodeFailerTest { tester.serviceMonitor.setHostDown(downHost2); tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(90)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); tester.runMaintainers(); // The host is failed in the 2. maintain() assertEquals(5 + 2, tester.deployer.redeployments); - assertEquals(7, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertEquals(5, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(6, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); @@ -520,13 +445,11 @@ public class NodeFailerTest { tester.serviceMonitor.setHostDown(downHost3); tester.runMaintainers(); tester.clock.advance(Duration.ofDays(1)); - tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(6 + 2, tester.deployer.redeployments); - assertEquals(9, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertEquals(6, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(4, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); } @@ -547,7 +470,6 @@ public class NodeFailerTest { for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); assertEquals(count, tester.nodeRepository.nodes().list(Node.State.active).nodeType(nodeType).size()); } @@ -560,7 +482,6 @@ public class NodeFailerTest { for (int minutes = 0; minutes < 45; minutes +=5 ) { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); assertEquals( 0, tester.deployer.redeployments); assertEquals(count, tester.nodeRepository.nodes().list(Node.State.active).nodeType(nodeType).size()); } @@ -582,38 +503,30 @@ public class NodeFailerTest { } @Test - public void failing_divergent_ready_nodes() { - NodeFailTester tester = NodeFailTester.withNoApplications(); - - Node readyNode = tester.createReadyNodes(1).get(0); - - tester.runMaintainers(); - assertEquals(Node.State.ready, readyNode.state()); - - tester.nodeRepository.nodes().write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); - - tester.runMaintainers(); - assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).size()); - } - - @Test public void node_failing_throttle() { // Throttles based on an absolute number in small zone { - // 10 hosts with 3 tenant nodes each, total 40 nodes - NodeFailTester tester = NodeFailTester.withTwoApplications(10); - NodeList hosts = tester.nodeRepository.nodes().list().nodeType(NodeType.host); + // 10 hosts with 7 container and 7 content nodes, total 24 nodes + NodeFailTester tester = NodeFailTester.withTwoApplications(10, 7, 7); + + List<String> failedHostHostnames = tester.nodeRepository.nodes().list().stream() + .flatMap(node -> node.parentHostname().stream()) + .collect(Collectors.groupingBy(h -> h, Collectors.counting())) + .entrySet().stream() + .sorted(Comparator.comparingLong((Map.Entry<String, Long> e) -> e.getValue()).reversed()) + .limit(3) + .map(Map.Entry::getKey) + .toList(); // 3 hosts fail. 2 of them and all of their children are allowed to fail - List<Node> failedHosts = hosts.asList().subList(0, 3); - failedHosts.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname())); + failedHostHostnames.forEach(hostname -> tester.serviceMonitor.setHostDown(hostname)); tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(61)); tester.runMaintainers(); tester.runMaintainers(); // hosts are typically failed in the 2. maintain() assertEquals(2 + /* hosts */ - (2 * 3) /* containers per host */, + (2 * 2) /* containers per host */, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric)); @@ -623,7 +536,7 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(interval)); } tester.runMaintainers(); - assertEquals(8, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(6, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric)); @@ -631,14 +544,14 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(30)); tester.runMaintainers(); tester.runMaintainers(); // hosts are failed in the 2. maintain() - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(9, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // Nothing else to fail tester.clock.advance(Duration.ofHours(25)); tester.runMaintainers(); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(9, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index 03ab18c15d9..faf3f60e8af 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -74,6 +74,8 @@ import static org.junit.Assert.assertTrue; */ public class ProvisioningTester { + public static final ApplicationId tenantHostApp = ApplicationId.from("hosted-vespa", "tenant-host", "default"); + private final Curator curator; private final NodeFlavors nodeFlavors; private final ManualClock clock; @@ -543,7 +545,7 @@ public class ProvisioningTester { } public void activateTenantHosts() { - prepareAndActivateInfraApplication(applicationId(), NodeType.host); + prepareAndActivateInfraApplication(tenantHostApp, NodeType.host); } public static ClusterSpec containerClusterSpec() { |