author     Jon Bratseth <bratseth@gmail.com>    2020-11-12 16:46:58 +0100
committer  Jon Bratseth <bratseth@gmail.com>    2020-11-12 16:46:58 +0100
commit     ccbb05c22ac610c2f6cb59024edcc3c90586b1a7 (patch)
tree       e2113f9fd6e697d22cda077c604e29336fa8d2bf
parent     152d8f57d0d62506c83894907ca3c2e1fc56dfd6 (diff)
Separate node failure maintenance from failing
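
This change splits the status-tracking half of NodeFailer into a new maintainer, NodeFailureStatusUpdater: the updater records "requested" liveness events for ready nodes and down/up history events for active nodes (via HostLivenessTracker and ServiceMonitor), while NodeFailer now only reads that recorded history and fails nodes that have been down past the grace period. A minimal sketch of the resulting wiring, mirroring the constructor calls in NodeRepositoryMaintenance below; the dependencies (deployer, hostLivenessTracker, serviceMonitor, nodeRepository, clock, orchestrator, metric) are assumed to be in scope, and the one-hour down-time limit is illustrative:

    // Sketch only: the 2-minute and 15-minute intervals match DefaultTimes below;
    // the down-time limit is illustrative (the tests use one hour).
    NodeFailureStatusUpdater statusUpdater =
            new NodeFailureStatusUpdater(hostLivenessTracker, serviceMonitor, nodeRepository,
                                         Duration.ofMinutes(2), metric);  // writes down/up history
    NodeFailer nodeFailer =
            new NodeFailer(deployer, nodeRepository,
                           Duration.ofHours(1),    // down-time limit before a node may be failed
                           Duration.ofMinutes(15), // maintenance interval
                           clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric);

    // Each cycle records status first, then acts on it, as NodeFailTester.runMaintainers() does
    // (maintain() is protected, so this ordering is only visible within the maintenance package):
    statusUpdater.maintain();
    nodeFailer.maintain();
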
-rw-r--r--  controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java |   1
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java |   2
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 107
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java | 137
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java |   4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java |   8
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java |   1
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java |   2
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java |  45
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 106
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json |   3
11 files changed, 239 insertions(+), 177 deletions(-)
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java
index 44ef653c2c8..0d291639a29 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeHistory.java
@@ -41,6 +41,7 @@ public class NodeHistory {
FailedExpirer,
InactiveExpirer,
NodeFailer,
+ NodeFailureStatusUpdater,
ProvisionedExpirer,
Rebalancer,
ReservationExpirer,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 59b8fd7785b..b2fc875a17f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -187,7 +187,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
metric.set("someServicesDown", (numberOfServicesDown > 0 ? 1 : 0), context);
- boolean badNode = NodeFailer.badNode(services);
+ boolean badNode = NodeFailureStatusUpdater.badNode(services);
metric.set("nodeFailerBadNode", (badNode ? 1 : 0), context);
boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index a651c4e52c2..fa9ed9bed26 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -1,8 +1,6 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
-import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.ApplicationLockException;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
@@ -11,8 +9,6 @@ import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.HostName;
-import com.yahoo.vespa.applicationmodel.ServiceInstance;
-import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
@@ -60,12 +56,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
/** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */
static final String throttlingActiveMetric = "nodeFailThrottling";
- /** Provides information about the status of ready hosts */
- private final HostLivenessTracker hostLivenessTracker;
-
- /** Provides (more accurate) information about the status of active hosts */
- private final ServiceMonitor serviceMonitor;
-
private final Deployer deployer;
private final Duration downTimeLimit;
private final Clock clock;
@@ -74,15 +64,12 @@ public class NodeFailer extends NodeRepositoryMaintainer {
private final ThrottlePolicy throttlePolicy;
private final Metric metric;
- public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker,
- ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
+ public NodeFailer(Deployer deployer, NodeRepository nodeRepository,
Duration downTimeLimit, Duration interval, Clock clock, Orchestrator orchestrator,
ThrottlePolicy throttlePolicy, Metric metric) {
// check ping status every interval, but at least twice as often as the down time limit
super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric);
this.deployer = deployer;
- this.hostLivenessTracker = hostLivenessTracker;
- this.serviceMonitor = serviceMonitor;
this.downTimeLimit = downTimeLimit;
this.clock = clock;
this.orchestrator = orchestrator;
@@ -98,8 +85,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
// Ready nodes
try (Mutex lock = nodeRepository().lockUnallocated()) {
- updateNodeLivenessEventsForReadyNodes(lock);
-
for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
Node node = entry.getKey();
if (throttle(node)) {
@@ -112,11 +97,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
}
- updateNodeDownState();
- List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
-
- // Fail active nodes
- for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason(activeNodes).entrySet()) {
+ // Active nodes
+ for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) {
Node node = entry.getKey();
if (!failAllowedFor(node.type())) continue;
@@ -139,21 +121,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return throttlingActive == 0;
}
- private void updateNodeLivenessEventsForReadyNodes(Mutex lock) {
- // Update node last request events through ZooKeeper to collect request to all config servers.
- // We do this here ("lazily") to avoid writing to zk for each config request.
- for (Node node : nodeRepository().getNodes(Node.State.ready)) {
- Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
- if (lastLocalRequest.isEmpty()) continue;
-
- if (! node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
- History updatedHistory = node.history()
- .with(new History.Event(History.Event.Type.requested, Agent.NodeFailer, lastLocalRequest.get()));
- nodeRepository().write(node.with(updatedHistory), lock);
- }
- }
- }
-
private Map<Node, String> getReadyNodesByFailureReason() {
Instant oldestAcceptableRequestTime =
// Allow requests some time to be registered in case all config servers have been down
@@ -183,38 +150,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return nodesByFailureReason;
}
- /**
- * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
- * Otherwise we remove any "down" history record.
- */
- private void updateNodeDownState() {
- NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active));
- serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
- Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first();
- if (node.isEmpty()) return;
-
- // Already correct record, nothing to do
- boolean badNode = badNode(serviceInstances);
- if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return;
-
- // Lock and update status
- ApplicationId owner = node.get().allocation().get().owner();
- try (var lock = nodeRepository().lock(owner)) {
- node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
- if (node.isEmpty()) return; // Node disappeared or changed allocation
- if (badNode) {
- recordAsDown(node.get(), lock);
- } else {
- clearDownRecord(node.get(), lock);
- }
- } catch (ApplicationLockException e) {
- // Fine, carry on with other nodes. We'll try updating this one in the next run
- log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
- }
- });
- }
-
- private Map<Node, String> getActiveNodesByFailureReason(List<Node> activeNodes) {
+ private Map<Node, String> getActiveNodesByFailureReason() {
+ List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
Map<Node, String> nodesByFailureReason = new HashMap<>();
for (Node node : activeNodes) {
@@ -252,13 +189,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return reasonsToFailParentHost(hostNode).size() > 0;
}
- /** Get node by given hostname and application. The applicationLock must be held when calling this */
- private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) {
- return nodeRepository().getNode(hostname, Node.State.active)
- .filter(node -> node.allocation().isPresent())
- .filter(node -> node.allocation().get().owner().equals(application));
- }
-
private boolean expectConfigRequests(Node node) {
return !node.type().isHost();
}
@@ -324,30 +254,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
/**
- * Returns true if the node is considered bad: All monitored services services are down.
- * If a node remains bad for a long time, the NodeFailer will try to fail the node.
- */
- static boolean badNode(List<ServiceInstance> services) {
- Map<ServiceStatus, Long> countsByStatus = services.stream()
- .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
-
- return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
- countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
- }
-
- /** Record a node as down if not already recorded */
- private void recordAsDown(Node node, Mutex lock) {
- if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp
- nodeRepository().write(node.downAt(clock.instant(), Agent.NodeFailer), lock);
- }
-
- /** Clear down record for node, if any */
- private void clearDownRecord(Node node, Mutex lock) {
- if (node.history().event(History.Event.Type.down).isEmpty()) return;
- nodeRepository().write(node.up(), lock);
- }
-
- /**
* Called when a node should be moved to the failed state: Do that if it seems safe,
* which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
* Otherwise not replacing the node ensures (by Orchestrator check) that no further action will be taken.
@@ -379,7 +285,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return true;
} catch (TransientException e) {
log.log(Level.INFO, "Failed to redeploy " + node.allocation().get().owner() +
- " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e));
+ " with a transient error, will be retried by application maintainer: " +
+ Exceptions.toMessageString(e));
return true;
} catch (RuntimeException e) {
// The expected reason for deployment to fail here is that there is no capacity available to redeploy.
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
new file mode 100644
index 00000000000..e143f70c7c3
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
@@ -0,0 +1,137 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.maintenance;
+
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ApplicationLockException;
+import com.yahoo.config.provision.HostLivenessTracker;
+import com.yahoo.jdisc.Metric;
+import com.yahoo.transaction.Mutex;
+import com.yahoo.vespa.applicationmodel.ServiceInstance;
+import com.yahoo.vespa.applicationmodel.ServiceStatus;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.History;
+import com.yahoo.vespa.service.monitor.ServiceMonitor;
+import com.yahoo.yolean.Exceptions;
+
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.logging.Level;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.counting;
+
+/**
+ * Checks if nodes are responding and updates their status accordingly
+ *
+ * @author bratseth
+ */
+public class NodeFailureStatusUpdater extends NodeRepositoryMaintainer {
+
+ /** Provides information about the status of ready hosts */
+ private final HostLivenessTracker hostLivenessTracker;
+
+ /** Provides (more accurate) information about the status of active hosts */
+ private final ServiceMonitor serviceMonitor;
+
+ public NodeFailureStatusUpdater(HostLivenessTracker hostLivenessTracker,
+ ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
+ Duration interval, Metric metric) {
+ super(nodeRepository, interval, metric);
+ this.hostLivenessTracker = hostLivenessTracker;
+ this.serviceMonitor = serviceMonitor;
+ }
+
+ @Override
+ protected boolean maintain() {
+ updateReadyNodeLivenessEvents();
+ updateActiveNodeDownState();
+ return true;
+ }
+
+ private void updateReadyNodeLivenessEvents() {
+ // Update node last-request events through ZooKeeper to collect requests to all config servers.
+ // We do this here ("lazily") to avoid writing to zk for each config request.
+ try (Mutex lock = nodeRepository().lockUnallocated()) {
+ for (Node node : nodeRepository().getNodes(Node.State.ready)) {
+ Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
+ if (lastLocalRequest.isEmpty()) continue;
+
+ if (!node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
+ History updatedHistory = node.history()
+ .with(new History.Event(History.Event.Type.requested, Agent.NodeFailureStatusUpdater, lastLocalRequest.get()));
+ nodeRepository().write(node.with(updatedHistory), lock);
+ }
+ }
+ }
+ }
+
+ /**
+ * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
+ * Otherwise we remove any "down" history record.
+ */
+ private void updateActiveNodeDownState() {
+ NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active));
+ serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
+ Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first();
+ if (node.isEmpty()) return;
+
+ // Already correct record, nothing to do
+ boolean badNode = badNode(serviceInstances);
+ if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return;
+
+ // Lock and update status
+ ApplicationId owner = node.get().allocation().get().owner();
+ try (var lock = nodeRepository().lock(owner)) {
+ node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
+ if (node.isEmpty()) return; // Node disappeared or changed allocation
+ if (badNode) {
+ recordAsDown(node.get(), lock);
+ } else {
+ clearDownRecord(node.get(), lock);
+ }
+ } catch (ApplicationLockException e) {
+ // Fine, carry on with other nodes. We'll try updating this one in the next run
+ log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
+ }
+ });
+ }
+
+ /**
+ * Returns true if the node is considered bad: All monitored services are down.
+ * If a node remains bad for a long time, the NodeFailer will try to fail the node.
+ */
+ static boolean badNode(List<ServiceInstance> services) {
+ Map<ServiceStatus, Long> countsByStatus = services.stream()
+ .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
+
+ return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
+ countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
+ }
+
+ /** Get node by given hostname and application. The applicationLock must be held when calling this */
+ private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) {
+ return nodeRepository().getNode(hostname, Node.State.active)
+ .filter(node -> node.allocation().isPresent())
+ .filter(node -> node.allocation().get().owner().equals(application));
+ }
+
+ /** Record a node as down if not already recorded */
+ private void recordAsDown(Node node, Mutex lock) {
+ if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp
+ nodeRepository().write(node.downAt(clock().instant(), Agent.NodeFailureStatusUpdater), lock);
+ }
+
+ /** Clear down record for node, if any */
+ private void clearDownRecord(Node node, Mutex lock) {
+ if (node.history().event(History.Event.Type.down).isEmpty()) return;
+ nodeRepository().write(node.up(), lock);
+ }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
index 2e9cf783f79..84dd2c6a8c3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
@@ -9,6 +9,7 @@ import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import java.time.Clock;
import java.time.Duration;
import java.util.List;
import java.util.Map;
@@ -36,6 +37,9 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
/** Returns the node repository */
protected NodeRepository nodeRepository() { return nodeRepository; }
+ /** Returns the node repository clock */
+ protected Clock clock() { return nodeRepository.clock(); }
+
/** A utility to group active tenant nodes by application */
protected Map<ApplicationId, List<Node>> activeNodesByApplication() {
return nodeRepository().list(Node.State.active)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 5e3584bfcd0..c14cd18425d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -29,6 +29,7 @@ import java.util.Optional;
public class NodeRepositoryMaintenance extends AbstractComponent {
private final NodeFailer nodeFailer;
+ private final NodeFailureStatusUpdater nodeFailureStatusUpdater;
private final PeriodicApplicationMaintainer periodicApplicationMaintainer;
private final OperatorChangeApplicationMaintainer operatorChangeApplicationMaintainer;
private final ReservationExpirer reservationExpirer;
@@ -68,8 +69,8 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
MetricsFetcher metricsFetcher, MetricsDb metricsDb) {
DefaultTimes defaults = new DefaultTimes(zone, deployer);
- nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, defaults.failGrace,
- defaults.nodeFailerInterval, clock, orchestrator, defaults.throttlePolicy, metric);
+ nodeFailer = new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, clock, orchestrator, defaults.throttlePolicy, metric);
+ nodeFailureStatusUpdater = new NodeFailureStatusUpdater(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric);
periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, metric, nodeRepository,
defaults.redeployMaintainerInterval, defaults.periodicRedeployInterval, flagSource);
operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.operatorChangeRedeployInterval);
@@ -101,6 +102,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
@Override
public void deconstruct() {
nodeFailer.close();
+ nodeFailureStatusUpdater.close();
periodicApplicationMaintainer.close();
operatorChangeApplicationMaintainer.close();
reservationExpirer.close();
@@ -144,6 +146,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final Duration spareCapacityMaintenanceInterval;
private final Duration metricsInterval;
private final Duration nodeFailerInterval;
+ private final Duration nodeFailureStatusUpdateInterval;
private final Duration retiredInterval;
private final Duration infrastructureProvisionInterval;
private final Duration loadBalancerExpirerInterval;
@@ -166,6 +169,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
loadBalancerExpirerInterval = Duration.ofMinutes(5);
metricsInterval = Duration.ofMinutes(1);
nodeFailerInterval = Duration.ofMinutes(15);
+ nodeFailureStatusUpdateInterval = Duration.ofMinutes(2);
nodeMetricsCollectionInterval = Duration.ofMinutes(1);
operatorChangeRedeployInterval = Duration.ofMinutes(3);
// Vespa upgrade frequency is higher in CD so (de)activate OS upgrades more frequently as well
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
index b82c99ac26e..e32b5401824 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
@@ -14,6 +14,7 @@ public enum Agent {
// Specific system agents:
NodeFailer,
+ NodeFailureStatusUpdater,
Rebalancer,
DirtyExpirer,
FailedExpirer,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index c555d0281a5..97f7807f689 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -438,6 +438,7 @@ public class NodeSerializer {
case "FailedExpirer" : return Agent.FailedExpirer;
case "InactiveExpirer" : return Agent.InactiveExpirer;
case "NodeFailer" : return Agent.NodeFailer;
+ case "NodeFailureStatusUpdater" : return Agent.NodeFailureStatusUpdater;
case "ProvisionedExpirer" : return Agent.ProvisionedExpirer;
case "Rebalancer" : return Agent.Rebalancer;
case "ReservationExpirer" : return Agent.ReservationExpirer;
@@ -457,6 +458,7 @@ public class NodeSerializer {
case FailedExpirer : return "FailedExpirer";
case InactiveExpirer : return "InactiveExpirer";
case NodeFailer : return "NodeFailer";
+ case NodeFailureStatusUpdater : return "NodeFailureStatusUpdater";
case ProvisionedExpirer : return "ProvisionedExpirer";
case Rebalancer : return "Rebalancer";
case ReservationExpirer : return "ReservationExpirer";
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 5b96d0055b8..d759ceba4f6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -59,6 +59,7 @@ public class NodeFailTester {
public final NodeRepository nodeRepository;
public final ProvisioningTester tester;
public NodeFailer failer;
+ public NodeFailureStatusUpdater updater;
public ServiceMonitorStub serviceMonitor;
public MockDeployer deployer;
public TestMetric metric;
@@ -79,6 +80,14 @@ public class NodeFailTester {
hostLivenessTracker = new TestHostLivenessTracker(clock);
}
+ private void initializeMaintainers(Map<ApplicationId, MockDeployer.ApplicationContext> apps) {
+ deployer = new MockDeployer(provisioner, tester.clock(), apps);
+ serviceMonitor = new ServiceMonitorStub(apps, nodeRepository);
+ metric = new TestMetric();
+ failer = createFailer();
+ updater = createUpdater();
+ }
+
public static NodeFailTester withTwoApplications() {
NodeFailTester tester = new NodeFailTester();
@@ -99,10 +108,7 @@ public class NodeFailTester {
Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
app1, new MockDeployer.ApplicationContext(app1, clusterApp1, capacity1),
app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2));
- tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
- tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
- tester.metric = new TestMetric();
- tester.failer = tester.createFailer();
+ tester.initializeMaintainers(apps);
return tester;
}
@@ -135,10 +141,7 @@ public class NodeFailTester {
tenantHostApp, new MockDeployer.ApplicationContext(tenantHostApp, clusterNodeAdminApp, allHosts),
app1, new MockDeployer.ApplicationContext(app1, clusterApp1, capacity1),
app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2));
- tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
- tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
- tester.metric = new TestMetric();
- tester.failer = tester.createFailer();
+ tester.initializeMaintainers(apps);
return tester;
}
@@ -148,10 +151,7 @@ public class NodeFailTester {
// Create applications
ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
- tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
- tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
- tester.metric = new TestMetric();
- tester.failer = tester.createFailer();
+ tester.initializeMaintainers(apps);
return tester;
}
@@ -167,22 +167,21 @@ public class NodeFailTester {
Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allNodes));
- tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
- tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
- tester.metric = new TestMetric();
- tester.failer = tester.createFailer();
+ tester.initializeMaintainers(apps);
return tester;
}
public static NodeFailTester withNoApplications() {
NodeFailTester tester = new NodeFailTester();
- tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), Map.of());
- tester.serviceMonitor = new ServiceMonitorStub(Map.of(), tester.nodeRepository);
- tester.metric = new TestMetric();
- tester.failer = tester.createFailer();
+ tester.initializeMaintainers(Map.of());
return tester;
}
+ public void runMaintainers() {
+ updater.maintain();
+ failer.maintain();
+ }
+
public void suspend(ApplicationId app) {
try {
orchestrator.suspend(app);
@@ -200,10 +199,14 @@ public class NodeFailTester {
}
public NodeFailer createFailer() {
- return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour,
+ return new NodeFailer(deployer, nodeRepository, downtimeLimitOneHour,
Duration.ofMinutes(5), clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric);
}
+ public NodeFailureStatusUpdater createUpdater() {
+ return new NodeFailureStatusUpdater(hostLivenessTracker, serviceMonitor, nodeRepository, Duration.ofMinutes(5), metric);
+ }
+
public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) {
allNodesMakeAConfigRequestExcept(List.of(deadNodeArray));
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index cd2acebf04f..dcb70eed099 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -70,10 +70,10 @@ public class NodeFailerTest {
// Suspend the first of the active nodes
tester.suspend(hostnamesByState.get(Node.State.active).get(0));
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
// The first (and the only) ready node and the 1st active node that was allowed to fail should be failed
Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of(
@@ -88,7 +88,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
// All of the children should be failed now
Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
@@ -98,7 +98,7 @@ public class NodeFailerTest {
assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
tester.suspend(hostWithHwFailure);
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size());
}
@@ -129,7 +129,7 @@ public class NodeFailerTest {
// The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -139,7 +139,7 @@ public class NodeFailerTest {
tester.suspend(hostWithFailureReports);
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -149,7 +149,7 @@ public class NodeFailerTest {
tester.suspend(activeChild1);
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -159,7 +159,7 @@ public class NodeFailerTest {
tester.suspend(activeChild2);
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state());
@@ -176,9 +176,9 @@ public class NodeFailerTest {
String host_from_normal_app = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
tester.serviceMonitor.setHostDown(host_from_suspended_app);
tester.serviceMonitor.setHostDown(host_from_normal_app);
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(65));
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.getNode(host_from_normal_app).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(host_from_suspended_app).get().state());
@@ -190,7 +190,7 @@ public class NodeFailerTest {
// For a day all nodes work so nothing happens
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
@@ -206,7 +206,7 @@ public class NodeFailerTest {
tester.nodeRepository.write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
tester.nodeRepository.write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
@@ -217,7 +217,7 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(downHost2);
// nothing happens the first 45 minutes
for (int minutes = 0; minutes < 45; minutes +=5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
assertEquals( 0, tester.deployer.redeployments);
@@ -230,7 +230,7 @@ public class NodeFailerTest {
// downHost2 should now be failed and replaced, but not downHost1
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 1, tester.deployer.redeployments);
assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -239,17 +239,17 @@ public class NodeFailerTest {
// downHost1 fails again
tester.serviceMonitor.setHostDown(downHost1);
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
// the system goes down
tester.clock.advance(Duration.ofMinutes(120));
tester.failer = tester.createFailer();
- tester.failer.run();
+ tester.runMaintainers();
// the host is still down and fails
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 2, tester.deployer.redeployments);
assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -260,7 +260,7 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(lastNode.hostname());
// it is not failed because there are no ready nodes to replace it
for (int minutes = 0; minutes < 75; minutes +=5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
assertEquals( 2, tester.deployer.redeployments);
@@ -273,7 +273,7 @@ public class NodeFailerTest {
tester.createReadyNodes(1, 16, NodeFailTester.nodeResources);
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
// The node is now failed
assertEquals( 3, tester.deployer.redeployments);
assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -300,7 +300,7 @@ public class NodeFailerTest {
// nothing happens the first 45 minutes
for (int minutes = 0; minutes < 45; minutes +=5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
assertEquals(0, tester.deployer.redeployments);
@@ -311,7 +311,7 @@ public class NodeFailerTest {
// downHost should now be failed and replaced
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(1, tester.deployer.redeployments);
assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -330,7 +330,7 @@ public class NodeFailerTest {
for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
@@ -343,14 +343,14 @@ public class NodeFailerTest {
.filter(node -> ! node.resources().equals(newNodeResources))
.collect(Collectors.toList());
tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode);
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
// Another ready node dies and the node that died earlier, are allowed to fail
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3));
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0));
assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -364,7 +364,7 @@ public class NodeFailerTest {
for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size());
assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size());
}
@@ -373,7 +373,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(180));
Node dockerHost = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).iterator().next();
tester.allNodesMakeAConfigRequestExcept(dockerHost);
- tester.failer.run();
+ tester.runMaintainers();
assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size());
assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size());
}
@@ -386,7 +386,7 @@ public class NodeFailerTest {
for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
@@ -399,7 +399,7 @@ public class NodeFailerTest {
// nothing happens the first 45 minutes
for (int minutes = 0; minutes < 45; minutes += 5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
assertEquals(0, tester.deployer.redeployments);
@@ -410,7 +410,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(30));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(2 + 1, tester.deployer.redeployments);
assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -425,7 +425,7 @@ public class NodeFailerTest {
// nothing happens during the entire day because of the failure throttling
for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
@@ -433,7 +433,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(30));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(3 + 1, tester.deployer.redeployments);
assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -445,10 +445,10 @@ public class NodeFailerTest {
// Lets fail another host, make sure it is not the same where downTenant1 is a child
String downHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
tester.serviceMonitor.setHostDown(downHost2);
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(90));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(5 + 2, tester.deployer.redeployments);
assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -460,10 +460,10 @@ public class NodeFailerTest {
// node, while app2's should remain
String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
tester.serviceMonitor.setHostDown(downHost3);
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(6 + 2, tester.deployer.redeployments);
assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
@@ -487,7 +487,7 @@ public class NodeFailerTest {
// For a day all nodes work so nothing happens
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
@@ -500,7 +500,7 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(downHost);
// nothing happens the first 45 minutes
for (int minutes = 0; minutes < 45; minutes +=5 ) {
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
assertEquals( 0, tester.deployer.redeployments);
@@ -508,7 +508,7 @@ public class NodeFailerTest {
}
tester.clock.advance(Duration.ofMinutes(60));
- tester.failer.run();
+ tester.runMaintainers();
// one down host should now be failed, but not two as we are only allowed to fail one proxy
assertEquals(expectedFailCount, tester.deployer.redeployments);
@@ -519,7 +519,7 @@ public class NodeFailerTest {
// trying to fail again will still not fail the other down host
tester.clock.advance(Duration.ofMinutes(60));
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
}
@@ -529,12 +529,12 @@ public class NodeFailerTest {
Node readyNode = tester.createReadyNodes(1).get(0);
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(Node.State.ready, readyNode.state());
tester.nodeRepository.write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(1, tester.nodeRepository.getNodes(Node.State.failed).size());
}
@@ -555,7 +555,7 @@ public class NodeFailerTest {
}
// 2 nodes are failed (the minimum amount that are always allowed to fail)
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -565,7 +565,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -575,7 +575,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size());
// 24 more hours pass, nothing happens
@@ -590,11 +590,11 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(host.hostname());
deadNodes.add(host);
});
- tester.failer.run();
+ tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(61));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(4 + /* already failed */
2 + /* hosts */
(2 * 3) /* containers per host */,
@@ -607,14 +607,14 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric));
// The final host and its containers are failed out
tester.clock.advance(Duration.ofMinutes(30));
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -622,7 +622,7 @@ public class NodeFailerTest {
// Nothing else to fail
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -639,7 +639,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
- tester.failer.run();
+ tester.runMaintainers();
// 2% are allowed to fail
assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
@@ -650,7 +650,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -660,7 +660,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
- tester.failer.run();
+ tester.runMaintainers();
assertEquals(15, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -694,7 +694,7 @@ public class NodeFailerTest {
addServiceInstances(services, ServiceStatus.NOT_CHECKED, numNotChecked);
Collections.shuffle(services);
- return NodeFailer.badNode(services);
+ return NodeFailureStatusUpdater.badNode(services);
}
/**
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
index 55d083c877d..0d9ca44a293 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
@@ -25,6 +25,9 @@
"name": "NodeFailer"
},
{
+ "name": "NodeFailureStatusUpdater"
+ },
+ {
"name": "NodeMetricsDbMaintainer"
},
{