author    | Jon Bratseth <bratseth@gmail.com> | 2020-11-12 16:46:58 +0100
committer | Jon Bratseth <bratseth@gmail.com> | 2020-11-12 16:46:58 +0100
commit    | ccbb05c22ac610c2f6cb59024edcc3c90586b1a7
tree      | e2113f9fd6e697d22cda077c604e29336fa8d2bf
parent    | 152d8f57d0d62506c83894907ca3c2e1fc56dfd6
Separate node failure maintenance from failing
11 files changed, 239 insertions(+), 177 deletions(-)
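In outline, the commit splits NodeFailer's two jobs: the new NodeFailureStatusUpdater maintainer translates service-monitor observations into down/up history records on a short interval, while NodeFailer keeps only the decision to fail nodes that have stayed down past the grace period. Below is a minimal sketch of that hand-off, using a plain map as a stand-in for the node repository's per-node History; all names in the sketch are illustrative, not the real Vespa types.

    import java.time.Clock;
    import java.time.Duration;
    import java.time.Instant;
    import java.util.HashMap;
    import java.util.Map;

    /** Illustrative only: a plain map stands in for the node repository's History records. */
    public class NodeFailureSplitSketch {

        private final Map<String, Instant> downSince = new HashMap<>();
        private final Clock clock;

        public NodeFailureSplitSketch(Clock clock) { this.clock = clock; }

        /** NodeFailureStatusUpdater's role: runs on a short interval and only updates status records. */
        public void updateStatus(String hostname, boolean allMonitoredServicesDown) {
            if (allMonitoredServicesDown)
                downSince.putIfAbsent(hostname, clock.instant()); // like recordAsDown: keep the first 'down' timestamp
            else
                downSince.remove(hostname);                       // like clearDownRecord
        }

        /** NodeFailer's remaining role: act once a node has been down longer than the grace period. */
        public boolean shouldFail(String hostname, Duration downTimeLimit) {
            Instant since = downSince.get(hostname);
            return since != null && !since.isAfter(clock.instant().minus(downTimeLimit));
        }

    }

In the commit itself the records are written through NodeRepository under the unallocated or application lock, and the test helper NodeFailTester.runMaintainers() exercises exactly this ordering: updater.maintain() followed by failer.maintain().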
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java
index 44ef653c2c8..0d291639a29 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java
@@ -41,6 +41,7 @@ public class NodeHistory {
         FailedExpirer,
         InactiveExpirer,
         NodeFailer,
+        NodeFailureStatusUpdater,
         ProvisionedExpirer,
         Rebalancer,
         ReservationExpirer,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 59b8fd7785b..b2fc875a17f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -187,7 +187,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
         metric.set("someServicesDown", (numberOfServicesDown > 0 ? 1 : 0), context);

-        boolean badNode = NodeFailer.badNode(services);
+        boolean badNode = NodeFailureStatusUpdater.badNode(services);
         metric.set("nodeFailerBadNode", (badNode ? 1 : 0), context);

         boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index a651c4e52c2..fa9ed9bed26 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -1,8 +1,6 @@
 // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.provision.maintenance;

-import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.ApplicationLockException;
 import com.yahoo.config.provision.Deployer;
 import com.yahoo.config.provision.Deployment;
 import com.yahoo.config.provision.HostLivenessTracker;
@@ -11,8 +9,6 @@ import com.yahoo.config.provision.TransientException;
 import com.yahoo.jdisc.Metric;
 import com.yahoo.transaction.Mutex;
 import com.yahoo.vespa.applicationmodel.HostName;
-import com.yahoo.vespa.applicationmodel.ServiceInstance;
-import com.yahoo.vespa.applicationmodel.ServiceStatus;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
@@ -60,12 +56,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
     /** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */
     static final String throttlingActiveMetric = "nodeFailThrottling";

-    /** Provides information about the status of ready hosts */
-    private final HostLivenessTracker hostLivenessTracker;
-
-    /** Provides (more accurate) information about the status of active hosts */
-    private final ServiceMonitor serviceMonitor;
-
     private final Deployer deployer;
     private final Duration downTimeLimit;
     private final Clock clock;
@@ -74,15 +64,12 @@ public class NodeFailer extends NodeRepositoryMaintainer {
     private final ThrottlePolicy throttlePolicy;
     private final Metric metric;

-    public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker,
-                      ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
+    public NodeFailer(Deployer deployer, NodeRepository nodeRepository,
                       Duration downTimeLimit, Duration interval, Clock clock, Orchestrator orchestrator,
                       ThrottlePolicy throttlePolicy, Metric metric) {
         // check ping status every interval, but at least twice as often as the down time limit
         super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric);
         this.deployer = deployer;
-        this.hostLivenessTracker = hostLivenessTracker;
-        this.serviceMonitor = serviceMonitor;
         this.downTimeLimit = downTimeLimit;
         this.clock = clock;
         this.orchestrator = orchestrator;
@@ -98,8 +85,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {

         // Ready nodes
         try (Mutex lock = nodeRepository().lockUnallocated()) {
-            updateNodeLivenessEventsForReadyNodes(lock);
-
             for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
                 Node node = entry.getKey();
                 if (throttle(node)) {
@@ -112,11 +97,8 @@
             }
         }

-        updateNodeDownState();
-        List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
-
-        // Fail active nodes
-        for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason(activeNodes).entrySet()) {
+        // Active nodes
+        for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) {
             Node node = entry.getKey();
             if (!failAllowedFor(node.type())) continue;
@@ -139,21 +121,6 @@
         return throttlingActive == 0;
     }

-    private void updateNodeLivenessEventsForReadyNodes(Mutex lock) {
-        // Update node last request events through ZooKeeper to collect request to all config servers.
-        // We do this here ("lazily") to avoid writing to zk for each config request.
-        for (Node node : nodeRepository().getNodes(Node.State.ready)) {
-            Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
-            if (lastLocalRequest.isEmpty()) continue;
-
-            if (! node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
-                History updatedHistory = node.history()
-                        .with(new History.Event(History.Event.Type.requested, Agent.NodeFailer, lastLocalRequest.get()));
-                nodeRepository().write(node.with(updatedHistory), lock);
-            }
-        }
-    }
-
     private Map<Node, String> getReadyNodesByFailureReason() {
         Instant oldestAcceptableRequestTime =
                 // Allow requests some time to be registered in case all config servers have been down
@@ -183,38 +150,8 @@
         return nodesByFailureReason;
     }

-    /**
-     * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
-     * Otherwise we remove any "down" history record.
-     */
-    private void updateNodeDownState() {
-        NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active));
-        serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
-            Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first();
-            if (node.isEmpty()) return;
-
-            // Already correct record, nothing to do
-            boolean badNode = badNode(serviceInstances);
-            if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return;
-
-            // Lock and update status
-            ApplicationId owner = node.get().allocation().get().owner();
-            try (var lock = nodeRepository().lock(owner)) {
-                node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
-                if (node.isEmpty()) return; // Node disappeared or changed allocation
-                if (badNode) {
-                    recordAsDown(node.get(), lock);
-                } else {
-                    clearDownRecord(node.get(), lock);
-                }
-            } catch (ApplicationLockException e) {
-                // Fine, carry on with other nodes. We'll try updating this one in the next run
-                log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
-            }
-        });
-    }
-
-    private Map<Node, String> getActiveNodesByFailureReason(List<Node> activeNodes) {
+    private Map<Node, String> getActiveNodesByFailureReason() {
+        List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
         Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
         Map<Node, String> nodesByFailureReason = new HashMap<>();
         for (Node node : activeNodes) {
@@ -252,13 +189,6 @@
         return reasonsToFailParentHost(hostNode).size() > 0;
     }

-    /** Get node by given hostname and application. The applicationLock must be held when calling this */
-    private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) {
-        return nodeRepository().getNode(hostname, Node.State.active)
-                .filter(node -> node.allocation().isPresent())
-                .filter(node -> node.allocation().get().owner().equals(application));
-    }
-
     private boolean expectConfigRequests(Node node) {
         return !node.type().isHost();
     }
@@ -324,30 +254,6 @@
     }

     /**
-     * Returns true if the node is considered bad: All monitored services services are down.
-     * If a node remains bad for a long time, the NodeFailer will try to fail the node.
-     */
-    static boolean badNode(List<ServiceInstance> services) {
-        Map<ServiceStatus, Long> countsByStatus = services.stream()
-                .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
-
-        return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
-               countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
-    }
-
-    /** Record a node as down if not already recorded */
-    private void recordAsDown(Node node, Mutex lock) {
-        if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp
-        nodeRepository().write(node.downAt(clock.instant(), Agent.NodeFailer), lock);
-    }
-
-    /** Clear down record for node, if any */
-    private void clearDownRecord(Node node, Mutex lock) {
-        if (node.history().event(History.Event.Type.down).isEmpty()) return;
-        nodeRepository().write(node.up(), lock);
-    }
-
-    /**
      * Called when a node should be moved to the failed state: Do that if it seems safe,
      * which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
      * Otherwise not replacing the node ensures (by Orchestrator check) that no further action will be taken.
@@ -379,7 +285,8 @@
             return true;
         } catch (TransientException e) {
             log.log(Level.INFO, "Failed to redeploy " + node.allocation().get().owner() +
-                                " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e));
+                                " with a transient error, will be retried by application maintainer: " +
+                                Exceptions.toMessageString(e));
             return true;
         } catch (RuntimeException e) {
             // The expected reason for deployment to fail here is that there is no capacity available to redeploy.
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
new file mode 100644
index 00000000000..e143f70c7c3
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
@@ -0,0 +1,137 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.maintenance;
+
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ApplicationLockException;
+import com.yahoo.config.provision.HostLivenessTracker;
+import com.yahoo.jdisc.Metric;
+import com.yahoo.transaction.Mutex;
+import com.yahoo.vespa.applicationmodel.ServiceInstance;
+import com.yahoo.vespa.applicationmodel.ServiceStatus;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.History;
+import com.yahoo.vespa.service.monitor.ServiceMonitor;
+import com.yahoo.yolean.Exceptions;
+
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.logging.Level;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.counting;
+
+/**
+ * Checks if nodes are responding and updates their status accordingly
+ *
+ * @author bratseth
+ */
+public class NodeFailureStatusUpdater extends NodeRepositoryMaintainer {
+
+    /** Provides information about the status of ready hosts */
+    private final HostLivenessTracker hostLivenessTracker;
+
+    /** Provides (more accurate) information about the status of active hosts */
+    private final ServiceMonitor serviceMonitor;
+
+    public NodeFailureStatusUpdater(HostLivenessTracker hostLivenessTracker,
+                                    ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
+                                    Duration interval, Metric metric) {
+        super(nodeRepository, interval, metric);
+        this.hostLivenessTracker = hostLivenessTracker;
+        this.serviceMonitor = serviceMonitor;
+    }
+
+    @Override
+    protected boolean maintain() {
+        updateReadyNodeLivenessEvents();
+        updateActiveNodeDownState();
+        return true;
+    }
+
+    private void updateReadyNodeLivenessEvents() {
+        // Update node last request events through ZooKeeper to collect request to all config servers.
+        // We do this here ("lazily") to avoid writing to zk for each config request.
+        try (Mutex lock = nodeRepository().lockUnallocated()) {
+            for (Node node : nodeRepository().getNodes(Node.State.ready)) {
+                Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
+                if (lastLocalRequest.isEmpty()) continue;
+
+                if (!node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
+                    History updatedHistory = node.history()
+                            .with(new History.Event(History.Event.Type.requested, Agent.NodeFailureStatusUpdater, lastLocalRequest.get()));
+                    nodeRepository().write(node.with(updatedHistory), lock);
+                }
+            }
+        }
+    }
+
+    /**
+     * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
+     * Otherwise we remove any "down" history record.
+     */
+    private void updateActiveNodeDownState() {
+        NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active));
+        serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
+            Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first();
+            if (node.isEmpty()) return;
+
+            // Already correct record, nothing to do
+            boolean badNode = badNode(serviceInstances);
+            if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return;
+
+            // Lock and update status
+            ApplicationId owner = node.get().allocation().get().owner();
+            try (var lock = nodeRepository().lock(owner)) {
+                node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
+                if (node.isEmpty()) return; // Node disappeared or changed allocation
+                if (badNode) {
+                    recordAsDown(node.get(), lock);
+                } else {
+                    clearDownRecord(node.get(), lock);
+                }
+            } catch (ApplicationLockException e) {
+                // Fine, carry on with other nodes. We'll try updating this one in the next run
+                log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
+            }
+        });
+    }
+
+    /**
+     * Returns true if the node is considered bad: All monitored services services are down.
+     * If a node remains bad for a long time, the NodeFailer will try to fail the node.
+     */
+    static boolean badNode(List<ServiceInstance> services) {
+        Map<ServiceStatus, Long> countsByStatus = services.stream()
+                .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
+
+        return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
+               countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
+    }
+
+    /** Get node by given hostname and application. The applicationLock must be held when calling this */
+    private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) {
+        return nodeRepository().getNode(hostname, Node.State.active)
+                .filter(node -> node.allocation().isPresent())
+                .filter(node -> node.allocation().get().owner().equals(application));
+    }
+
+    /** Record a node as down if not already recorded */
+    private void recordAsDown(Node node, Mutex lock) {
+        if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp
+        nodeRepository().write(node.downAt(clock().instant(), Agent.NodeFailureStatusUpdater), lock);
+    }
+
+    /** Clear down record for node, if any */
+    private void clearDownRecord(Node node, Mutex lock) {
+        if (node.history().event(History.Event.Type.down).isEmpty()) return;
+        nodeRepository().write(node.up(), lock);
+    }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
index 2e9cf783f79..84dd2c6a8c3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
@@ -9,6 +9,7 @@ import com.yahoo.jdisc.Metric;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeRepository;

+import java.time.Clock;
 import java.time.Duration;
 import java.util.List;
 import java.util.Map;
@@ -36,6 +37,9 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
     /** Returns the node repository */
     protected NodeRepository nodeRepository() { return nodeRepository; }

+    /** Returns the node repository clock */
+    protected Clock clock() { return nodeRepository.clock(); }
+
     /** A utility to group active tenant nodes by application */
     protected Map<ApplicationId, List<Node>> activeNodesByApplication() {
         return nodeRepository().list(Node.State.active)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 5e3584bfcd0..c14cd18425d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -29,6 +29,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {

     private final NodeFailer nodeFailer;
+    private final NodeFailureStatusUpdater nodeFailureStatusUpdater;
     private final PeriodicApplicationMaintainer periodicApplicationMaintainer;
     private final OperatorChangeApplicationMaintainer operatorChangeApplicationMaintainer;
     private final ReservationExpirer reservationExpirer;
@@ -68,8 +69,8 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
                                      MetricsFetcher metricsFetcher, MetricsDb metricsDb) {
         DefaultTimes defaults = new DefaultTimes(zone, deployer);

-        nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, defaults.failGrace,
-                                    defaults.nodeFailerInterval, clock, orchestrator, defaults.throttlePolicy, metric);
+        nodeFailer = new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, clock, orchestrator, defaults.throttlePolicy, metric);
+        nodeFailureStatusUpdater = new NodeFailureStatusUpdater(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric);
         periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, metric, nodeRepository, defaults.redeployMaintainerInterval,
                                                                           defaults.periodicRedeployInterval, flagSource);
         operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.operatorChangeRedeployInterval);
@@ -101,6 +102,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
     @Override
     public void deconstruct() {
         nodeFailer.close();
+        nodeFailureStatusUpdater.close();
         periodicApplicationMaintainer.close();
         operatorChangeApplicationMaintainer.close();
         reservationExpirer.close();
@@ -144,6 +146,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
         private final Duration spareCapacityMaintenanceInterval;
         private final Duration metricsInterval;
         private final Duration nodeFailerInterval;
+        private final Duration nodeFailureStatusUpdateInterval;
         private final Duration retiredInterval;
         private final Duration infrastructureProvisionInterval;
         private final Duration loadBalancerExpirerInterval;
@@ -166,6 +169,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
             loadBalancerExpirerInterval = Duration.ofMinutes(5);
             metricsInterval = Duration.ofMinutes(1);
             nodeFailerInterval = Duration.ofMinutes(15);
+            nodeFailureStatusUpdateInterval = Duration.ofMinutes(2);
             nodeMetricsCollectionInterval = Duration.ofMinutes(1);
             operatorChangeRedeployInterval = Duration.ofMinutes(3);
             // Vespa upgrade frequency is higher in CD so (de)activate OS upgrades more frequently as well
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
index b82c99ac26e..e32b5401824 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
@@ -14,6 +14,7 @@ public enum Agent {
     // Specific system agents:
     NodeFailer,
+    NodeFailureStatusUpdater,
     Rebalancer,
     DirtyExpirer,
     FailedExpirer,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index c555d0281a5..97f7807f689 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -438,6 +438,7 @@ public class NodeSerializer {
             case "FailedExpirer" : return Agent.FailedExpirer;
             case "InactiveExpirer" : return Agent.InactiveExpirer;
             case "NodeFailer" : return Agent.NodeFailer;
+            case "NodeFailureStatusUpdater" : return Agent.NodeFailureStatusUpdater;
             case "ProvisionedExpirer" : return Agent.ProvisionedExpirer;
             case "Rebalancer" : return Agent.Rebalancer;
             case "ReservationExpirer" : return Agent.ReservationExpirer;
@@ -457,6 +458,7 @@ public class NodeSerializer {
             case FailedExpirer : return "FailedExpirer";
             case InactiveExpirer : return "InactiveExpirer";
             case NodeFailer : return "NodeFailer";
+            case NodeFailureStatusUpdater : return "NodeFailureStatusUpdater";
             case ProvisionedExpirer : return "ProvisionedExpirer";
             case Rebalancer : return "Rebalancer";
             case ReservationExpirer : return "ReservationExpirer";
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 5b96d0055b8..d759ceba4f6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -59,6 +59,7 @@ public class NodeFailTester {
     public final NodeRepository nodeRepository;
     public final ProvisioningTester tester;
     public NodeFailer failer;
+    public NodeFailureStatusUpdater updater;
     public ServiceMonitorStub serviceMonitor;
     public MockDeployer deployer;
     public TestMetric metric;
@@ -79,6 +80,14 @@ public class NodeFailTester {
         hostLivenessTracker = new TestHostLivenessTracker(clock);
     }

+    private void initializeMaintainers(Map<ApplicationId, MockDeployer.ApplicationContext> apps) {
+        deployer = new MockDeployer(provisioner, tester.clock(), apps);
+        serviceMonitor = new ServiceMonitorStub(apps, nodeRepository);
+        metric = new TestMetric();
+        failer = createFailer();
+        updater = createUpdater();
+    }
+
     public static NodeFailTester withTwoApplications() {
         NodeFailTester tester = new NodeFailTester();
@@ -99,10 +108,7 @@
         Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
                 app1, new MockDeployer.ApplicationContext(app1, clusterApp1, capacity1),
                 app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2));
-        tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
-        tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
-        tester.metric = new TestMetric();
-        tester.failer = tester.createFailer();
+        tester.initializeMaintainers(apps);
         return tester;
     }
@@ -135,10 +141,7 @@
                 tenantHostApp, new MockDeployer.ApplicationContext(tenantHostApp, clusterNodeAdminApp, allHosts),
                 app1, new MockDeployer.ApplicationContext(app1, clusterApp1, capacity1),
                 app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2));
-        tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
-        tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
-        tester.metric = new TestMetric();
-        tester.failer = tester.createFailer();
+        tester.initializeMaintainers(apps);
         return tester;
     }
@@ -148,10 +151,7 @@
         // Create applications
         ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
         Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
-        tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
-        tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
-        tester.metric = new TestMetric();
-        tester.failer = tester.createFailer();
+        tester.initializeMaintainers(apps);
         return tester;
     }
@@ -167,22 +167,21 @@
         Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
                 app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allNodes));
-        tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
-        tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
-        tester.metric = new TestMetric();
-        tester.failer = tester.createFailer();
+        tester.initializeMaintainers(apps);
         return tester;
     }

     public static NodeFailTester withNoApplications() {
         NodeFailTester tester = new NodeFailTester();
-        tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), Map.of());
-        tester.serviceMonitor = new ServiceMonitorStub(Map.of(), tester.nodeRepository);
-        tester.metric = new TestMetric();
-        tester.failer = tester.createFailer();
+        tester.initializeMaintainers(Map.of());
         return tester;
     }

+    public void runMaintainers() {
+        updater.maintain();
+        failer.maintain();
+    }
+
     public void suspend(ApplicationId app) {
         try {
             orchestrator.suspend(app);
@@ -200,10 +199,14 @@
     }

     public NodeFailer createFailer() {
-        return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour,
+        return new NodeFailer(deployer, nodeRepository, downtimeLimitOneHour,
                               Duration.ofMinutes(5), clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric);
     }

+    public NodeFailureStatusUpdater createUpdater() {
+        return new NodeFailureStatusUpdater(hostLivenessTracker, serviceMonitor, nodeRepository, Duration.ofMinutes(5), metric);
+    }
+
     public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) {
         allNodesMakeAConfigRequestExcept(List.of(deadNodeArray));
     }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index cd2acebf04f..dcb70eed099 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -70,10 +70,10 @@ public class NodeFailerTest {

         // Suspend the first of the active nodes
         tester.suspend(hostnamesByState.get(Node.State.active).get(0));

-        tester.failer.run();
+        tester.runMaintainers();
         tester.clock.advance(Duration.ofHours(25));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();

         // The first (and the only) ready node and the 1st active node that was allowed to fail should be failed
         Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of(
@@ -88,7 +88,7 @@

         tester.clock.advance(Duration.ofHours(25));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();

         // All of the children should be failed now
         Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
@@ -98,7 +98,7 @@
         assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state());

         tester.suspend(hostWithHwFailure);
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
         assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size());
     }
@@ -129,7 +129,7 @@

         // The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -139,7 +139,7 @@
         tester.suspend(hostWithFailureReports);
         tester.clock.advance(Duration.ofHours(25));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -149,7 +149,7 @@
         tester.suspend(activeChild1);
         tester.clock.advance(Duration.ofHours(25));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -159,7 +159,7 @@
         tester.suspend(activeChild2);
         tester.clock.advance(Duration.ofHours(25));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -176,9 +176,9 @@
         String host_from_normal_app = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
         tester.serviceMonitor.setHostDown(host_from_suspended_app);
         tester.serviceMonitor.setHostDown(host_from_normal_app);
-        tester.failer.run();
+        tester.runMaintainers();
         tester.clock.advance(Duration.ofMinutes(65));
-        tester.failer.run();
+        tester.runMaintainers();

         assertEquals(Node.State.failed, tester.nodeRepository.getNode(host_from_normal_app).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.getNode(host_from_suspended_app).get().state());
@@ -190,7 +190,7 @@

         // For a day all nodes work so nothing happens
         for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();

@@ -206,7 +206,7 @@
         tester.nodeRepository.write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
         tester.nodeRepository.write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
         assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
         assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
@@ -217,7 +217,7 @@
         tester.serviceMonitor.setHostDown(downHost2);
         // nothing happens the first 45 minutes
         for (int minutes = 0; minutes < 45; minutes +=5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals( 0, tester.deployer.redeployments);
@@ -230,7 +230,7 @@
         // downHost2 should now be failed and replaced, but not downHost1
         tester.clock.advance(Duration.ofDays(1));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals( 1, tester.deployer.redeployments);
         assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
         assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -239,17 +239,17 @@

         // downHost1 fails again
         tester.serviceMonitor.setHostDown(downHost1);
-        tester.failer.run();
+        tester.runMaintainers();
         tester.clock.advance(Duration.ofMinutes(5));
         tester.allNodesMakeAConfigRequestExcept();
         // the system goes down
         tester.clock.advance(Duration.ofMinutes(120));
         tester.failer = tester.createFailer();
-        tester.failer.run();
+        tester.runMaintainers();
         // the host is still down and fails
         tester.clock.advance(Duration.ofMinutes(5));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals( 2, tester.deployer.redeployments);
         assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
         assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -260,7 +260,7 @@
         tester.serviceMonitor.setHostDown(lastNode.hostname());
         // it is not failed because there are no ready nodes to replace it
         for (int minutes = 0; minutes < 75; minutes +=5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals( 2, tester.deployer.redeployments);
@@ -273,7 +273,7 @@
         tester.createReadyNodes(1, 16, NodeFailTester.nodeResources);
         tester.clock.advance(Duration.ofDays(1));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         // The node is now failed
         assertEquals( 3, tester.deployer.redeployments);
         assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -300,7 +300,7 @@

         // nothing happens the first 45 minutes
         for (int minutes = 0; minutes < 45; minutes += 5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals(0, tester.deployer.redeployments);
@@ -311,7 +311,7 @@
         // downHost should now be failed and replaced
         tester.clock.advance(Duration.ofDays(1));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(1, tester.deployer.redeployments);
         assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
         assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -330,7 +330,7 @@
         for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept();
-            tester.failer.run();
+            tester.runMaintainers();
             assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
         }

@@ -343,14 +343,14 @@
                 .filter(node -> ! node.resources().equals(newNodeResources))
                 .collect(Collectors.toList());
         tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode);
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
         assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());

         // Another ready node dies and the node that died earlier, are allowed to fail
         tester.clock.advance(Duration.ofDays(1));
         tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3));
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
         assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0));
         assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -364,7 +364,7 @@
         for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept();
-            tester.failer.run();
+            tester.runMaintainers();
             assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size());
             assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size());
         }
@@ -373,7 +373,7 @@
         tester.clock.advance(Duration.ofMinutes(180));
         Node dockerHost = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).iterator().next();
         tester.allNodesMakeAConfigRequestExcept(dockerHost);
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size());
         assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size());
     }
@@ -386,7 +386,7 @@
         for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept();
-            tester.failer.run();
+            tester.runMaintainers();
             assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
             assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
             assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
@@ -399,7 +399,7 @@

         // nothing happens the first 45 minutes
         for (int minutes = 0; minutes < 45; minutes += 5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals(0, tester.deployer.redeployments);
@@ -410,7 +410,7 @@

         tester.clock.advance(Duration.ofMinutes(30));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();

         assertEquals(2 + 1, tester.deployer.redeployments);
         assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -425,7 +425,7 @@

         // nothing happens during the entire day because of the failure throttling
         for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
@@ -433,7 +433,7 @@

         tester.clock.advance(Duration.ofMinutes(30));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();

         assertEquals(3 + 1, tester.deployer.redeployments);
         assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -445,10 +445,10 @@
         // Lets fail another host, make sure it is not the same where downTenant1 is a child
         String downHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
         tester.serviceMonitor.setHostDown(downHost2);
-        tester.failer.run();
+        tester.runMaintainers();
         tester.clock.advance(Duration.ofMinutes(90));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();

         assertEquals(5 + 2, tester.deployer.redeployments);
         assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -460,10 +460,10 @@
         // node, while app2's should remain
         String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
         tester.serviceMonitor.setHostDown(downHost3);
-        tester.failer.run();
+        tester.runMaintainers();
         tester.clock.advance(Duration.ofDays(1));
         tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
+        tester.runMaintainers();

         assertEquals(6 + 2, tester.deployer.redeployments);
         assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -487,7 +487,7 @@

         // For a day all nodes work so nothing happens
         for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();

@@ -500,7 +500,7 @@
         tester.serviceMonitor.setHostDown(downHost);
         // nothing happens the first 45 minutes
         for (int minutes = 0; minutes < 45; minutes +=5 ) {
-            tester.failer.run();
+            tester.runMaintainers();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals( 0, tester.deployer.redeployments);
@@ -508,7 +508,7 @@
         }

         tester.clock.advance(Duration.ofMinutes(60));
-        tester.failer.run();
+        tester.runMaintainers();

         // one down host should now be failed, but not two as we are only allowed to fail one proxy
         assertEquals(expectedFailCount, tester.deployer.redeployments);
@@ -519,7 +519,7 @@

         // trying to fail again will still not fail the other down host
         tester.clock.advance(Duration.ofMinutes(60));
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
     }
@@ -529,12 +529,12 @@

         Node readyNode = tester.createReadyNodes(1).get(0);

-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(Node.State.ready, readyNode.state());

         tester.nodeRepository.write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
-        tester.failer.run();
+        tester.runMaintainers();

         assertEquals(1, tester.nodeRepository.getNodes(Node.State.failed).size());
     }
@@ -555,7 +555,7 @@
         }

         // 2 nodes are failed (the minimum amount that are always allowed to fail)
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -565,7 +565,7 @@
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept(deadNodes);
         }
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -575,7 +575,7 @@
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept(deadNodes);
         }
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size());

         // 24 more hours pass, nothing happens
@@ -590,11 +590,11 @@
             tester.serviceMonitor.setHostDown(host.hostname());
             deadNodes.add(host);
         });
-        tester.failer.run();
+        tester.runMaintainers();
         tester.clock.advance(Duration.ofMinutes(61));
         tester.allNodesMakeAConfigRequestExcept(deadNodes);

-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(4 + /* already failed */
                      2 + /* hosts */
                      (2 * 3) /* containers per host */,
@@ -607,14 +607,14 @@
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept(deadNodes);
         }
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric));

         // The final host and its containers are failed out
         tester.clock.advance(Duration.ofMinutes(30));
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -622,7 +622,7 @@
         // Nothing else to fail
         tester.clock.advance(Duration.ofHours(25));
         tester.allNodesMakeAConfigRequestExcept(deadNodes);
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -639,7 +639,7 @@
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept(deadNodes);
         }
-        tester.failer.run();
+        tester.runMaintainers();

         // 2% are allowed to fail
         assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
@@ -650,7 +650,7 @@
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept(deadNodes);
         }
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -660,7 +660,7 @@
             tester.clock.advance(Duration.ofMinutes(interval));
             tester.allNodesMakeAConfigRequestExcept(deadNodes);
         }
-        tester.failer.run();
+        tester.runMaintainers();
         assertEquals(15, tester.nodeRepository.getNodes(Node.State.failed).size());
         assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -694,7 +694,7 @@
         addServiceInstances(services, ServiceStatus.NOT_CHECKED, numNotChecked);
         Collections.shuffle(services);

-        return NodeFailer.badNode(services);
+        return NodeFailureStatusUpdater.badNode(services);
     }

     /**
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
index 55d083c877d..0d9ca44a293 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
@@ -25,6 +25,9 @@
       "name": "NodeFailer"
     },
     {
+      "name": "NodeFailureStatusUpdater"
+    },
+    {
       "name": "NodeMetricsDbMaintainer"
     },
     {
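One behavioural detail worth noting in the moved badNode predicate: a node is judged bad only when no monitored service is UP and at least one is positively DOWN, so services that are merely NOT_CHECKED never mark a node down on their own. A self-contained illustration of the same counting logic follows; the enum is a simplified stand-in for com.yahoo.vespa.applicationmodel.ServiceStatus, since the real method takes ServiceInstance objects.

    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    import static java.util.stream.Collectors.counting;

    public class BadNodeDemo {

        /** Simplified stand-in for com.yahoo.vespa.applicationmodel.ServiceStatus */
        enum ServiceStatus { UP, DOWN, NOT_CHECKED }

        /** Same shape as NodeFailureStatusUpdater.badNode: no service UP, and at least one known DOWN */
        static boolean badNode(List<ServiceStatus> statuses) {
            Map<ServiceStatus, Long> countsByStatus = statuses.stream()
                    .collect(Collectors.groupingBy(status -> status, counting()));
            return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
                   countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
        }

        public static void main(String[] args) {
            System.out.println(badNode(List.of(ServiceStatus.DOWN, ServiceStatus.DOWN))); // true: everything down
            System.out.println(badNode(List.of(ServiceStatus.UP, ServiceStatus.DOWN)));   // false: one service still up
            System.out.println(badNode(List.of(ServiceStatus.NOT_CHECKED)));              // false: nothing confirmed down
        }

    }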