author     Valerij Fredriksen <freva@users.noreply.github.com>  2022-10-13 21:10:41 +0200
committer  GitHub <noreply@github.com>  2022-10-13 21:10:41 +0200
commit     a5ed12b351806b187613457b58982ca67f537594 (patch)
tree       84e4beabd59ca6a1440607b5ba28468e92a83e48 /node-repository
parent     6e7459c9eff635d9c8227cd8d1add320f298c0e2 (diff)
Revert "Remove HostLivenessTracker"
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java                 |  63
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java          |  30
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java  |   5
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java                           |   4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java      |  10
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java              |   1
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java      |  32
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java             |  42
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java             | 185
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java        |   4
10 files changed, 307 insertions, 69 deletions
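
For context: HostLivenessTracker, restored by this revert, is the config server's record of when each host last made a config request; NodeHealthTracker uses it to decide whether ready nodes are still alive. A minimal sketch of the contract as implied by the call sites in this diff (the real definition lives in com.yahoo.config.provision and may carry more than this):

import java.time.Instant;
import java.util.Optional;

/** Sketch of the liveness-tracking contract, inferred from usage in this diff. */
interface HostLivenessTrackerSketch {

    /** Called by the config server when a config request is received from the given hostname */
    void receivedRequestFrom(String hostname);

    /** Returns the time of the last config request from the given hostname, if any was seen */
    Optional<Instant> lastRequestFrom(String hostname);

}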
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 32eac49a288..3e7abe8f053 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -77,6 +77,23 @@ public class NodeFailer extends NodeRepositoryMaintainer {
int throttledHostFailures = 0;
int throttledNodeFailures = 0;
+ // Ready nodes
+ try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
+ for (FailingNode failing : findReadyFailingNodes()) {
+ attempts++;
+ if (throttle(failing.node())) {
+ failures++;
+ if (failing.node().type().isHost())
+ throttledHostFailures++;
+ else
+ throttledNodeFailures++;
+ continue;
+ }
+ nodeRepository().nodes().fail(failing.node().hostname(), Agent.NodeFailer, failing.reason());
+ }
+ }
+
+ // Active nodes
for (FailingNode failing : findActiveFailingNodes()) {
attempts++;
if (!failAllowedFor(failing.node().type())) continue;
@@ -99,6 +116,22 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return asSuccessFactor(attempts, failures);
}
+ private Collection<FailingNode> findReadyFailingNodes() {
+ Set<FailingNode> failingNodes = new HashSet<>();
+ for (Node node : nodeRepository().nodes().list(Node.State.ready)) {
+ Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
+ List<String> failureReports = reasonsToFailHost(hostNode);
+ if (failureReports.size() > 0) {
+ if (hostNode.equals(node)) {
+ failingNodes.add(new FailingNode(node, "Host has failure reports: " + failureReports));
+ } else {
+ failingNodes.add(new FailingNode(node, "Parent (" + hostNode + ") has failure reports: " + failureReports));
+ }
+ }
+ }
+ return failingNodes;
+ }
+
private Collection<FailingNode> findActiveFailingNodes() {
Set<FailingNode> failingNodes = new HashSet<>();
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
@@ -117,7 +150,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
for (Node node : activeNodes) {
if (allSuspended(node, activeNodes)) {
- Node host = node.parentHostname().flatMap(activeNodes::node).orElse(node);
+ Node host = node.parentHostname().flatMap(parent -> activeNodes.node(parent)).orElse(node);
if (host.type().isHost()) {
List<String> failureReports = reasonsToFailHost(host);
if ( ! failureReports.isEmpty()) {
@@ -142,7 +175,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
/** Returns whether node has any kind of hardware issue */
static boolean hasHardwareIssue(Node node, NodeList allNodes) {
- Node host = node.parentHostname().flatMap(allNodes::node).orElse(node);
+ Node host = node.parentHostname().flatMap(parent -> allNodes.node(parent)).orElse(node);
return reasonsToFailHost(host).size() > 0;
}
@@ -311,6 +344,30 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
- private record FailingNode(Node node, String reason) { }
+ private static class FailingNode {
+
+ private final Node node;
+ private final String reason;
+
+ public FailingNode(Node node, String reason) {
+ this.node = node;
+ this.reason = reason;
+ }
+
+ public Node node() { return node; }
+ public String reason() { return reason; }
+
+ @Override
+ public boolean equals(Object other) {
+ if ( ! (other instanceof FailingNode)) return false;
+ return ((FailingNode)other).node().equals(this.node());
+ }
+
+ @Override
+ public int hashCode() {
+ return node.hashCode();
+ }
+
+ }
}
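
Note that swapping the record for a hand-written FailingNode also changes equality: the record compared both node and reason, while the restored class compares on node only, so the HashSet used by findReadyFailingNodes and findActiveFailingNodes deduplicates by node regardless of reason. A self-contained illustration of that effect (class and field names hypothetical):

import java.util.HashSet;
import java.util.Set;

class NodeKeyed {

    private final String node;   // stands in for Node
    private final String reason;

    NodeKeyed(String node, String reason) { this.node = node; this.reason = reason; }

    @Override
    public boolean equals(Object other) {
        if ( ! (other instanceof NodeKeyed)) return false;
        return ((NodeKeyed) other).node.equals(this.node); // reason is ignored, as in FailingNode above
    }

    @Override
    public int hashCode() { return node.hashCode(); }

}

class EqualityDemo {
    public static void main(String[] args) {
        Set<NodeKeyed> failing = new HashSet<>();
        failing.add(new NodeKeyed("host1", "Host has failure reports"));
        failing.add(new NodeKeyed("host1", "Node is down")); // equal to the first entry
        System.out.println(failing.size()); // prints 1
    }
}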
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index 624492a14f3..b43e2ae051f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ApplicationLockException;
+import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.jdisc.Metric;
import com.yahoo.lang.MutableInteger;
import com.yahoo.transaction.Mutex;
@@ -12,10 +13,12 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
+import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -31,18 +34,41 @@ import static java.util.stream.Collectors.counting;
*/
public class NodeHealthTracker extends NodeRepositoryMaintainer {
+ /** Provides information about the status of ready hosts */
+ private final HostLivenessTracker hostLivenessTracker;
+
/** Provides (more accurate) information about the status of active hosts */
private final ServiceMonitor serviceMonitor;
- public NodeHealthTracker(ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
+ public NodeHealthTracker(HostLivenessTracker hostLivenessTracker,
+ ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
Duration interval, Metric metric) {
super(nodeRepository, interval, metric);
+ this.hostLivenessTracker = hostLivenessTracker;
this.serviceMonitor = serviceMonitor;
}
@Override
protected double maintain() {
- return updateActiveNodeDownState();
+ return ( updateReadyNodeLivenessEvents() + updateActiveNodeDownState() ) / 2;
+ }
+
+ private double updateReadyNodeLivenessEvents() {
+ // Update node last-request events through ZooKeeper to collect requests made to all config servers.
+ // We do this here ("lazily") to avoid writing to zk for each config request.
+ try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
+ for (Node node : nodeRepository().nodes().list(Node.State.ready)) {
+ Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
+ if (lastLocalRequest.isEmpty()) continue;
+
+ if (!node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
+ History updatedHistory = node.history()
+ .with(new History.Event(History.Event.Type.requested, Agent.NodeHealthTracker, lastLocalRequest.get()));
+ nodeRepository().nodes().write(node.with(updatedHistory), lock);
+ }
+ }
+ }
+ return 1.0;
}
/**
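
The restored updateReadyNodeLivenessEvents is a write-coalescing pattern: liveness is tracked in memory on every config request, but persisted only when the maintainer runs, and only if the in-memory timestamp is newer than the last persisted requested event. A standalone sketch of the idea, with a plain map standing in for the ZooKeeper-backed store:

import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

/** Sketch: batch per-request liveness observations into periodic persistent writes. */
class LazyLivenessWriter {

    private final Map<String, Instant> inMemory = new HashMap<>();  // updated on every config request
    private final Map<String, Instant> persisted = new HashMap<>(); // stands in for ZooKeeper

    void receivedRequestFrom(String hostname, Instant now) {
        inMemory.put(hostname, now); // cheap: no persistent write per request
    }

    /** Called periodically, like NodeHealthTracker.maintain() */
    void flush() {
        for (Map.Entry<String, Instant> entry : inMemory.entrySet()) {
            Instant lastPersisted = persisted.get(entry.getKey());
            if (lastPersisted == null || entry.getValue().isAfter(lastPersisted))
                persisted.put(entry.getKey(), entry.getValue()); // at most one write per host per run
        }
    }

    Optional<Instant> lastPersistedRequestFrom(String hostname) {
        return Optional.ofNullable(persisted.get(hostname));
    }

}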
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 708d8b59eb0..dac6ee61ef3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -5,6 +5,7 @@ import com.yahoo.component.AbstractComponent;
import com.yahoo.component.annotation.Inject;
import com.yahoo.concurrent.maintenance.Maintainer;
import com.yahoo.config.provision.Deployer;
+import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.InfraDeployer;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.Zone;
@@ -32,7 +33,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
@SuppressWarnings("unused")
@Inject
public NodeRepositoryMaintenance(NodeRepository nodeRepository, Deployer deployer, InfraDeployer infraDeployer,
- ServiceMonitor serviceMonitor,
+ HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor,
Zone zone, Metric metric,
ProvisionServiceProvider provisionServiceProvider, FlagSource flagSource,
MetricsFetcher metricsFetcher) {
@@ -45,7 +46,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
maintainers.add(infrastructureProvisioner);
maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, defaults.throttlePolicy, metric));
- maintainers.add(new NodeHealthTracker(serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric));
+ maintainers.add(new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric));
maintainers.add(new ExpeditedChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.expeditedChangeRedeployInterval));
maintainers.add(new ReservationExpirer(nodeRepository, defaults.reservationExpiry, metric));
maintainers.add(new RetiredExpirer(nodeRepository, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index 4aca1cbd056..14fa2e1c8ff 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -46,10 +46,8 @@ public class History {
private static ImmutableMap<Event.Type, Event> toImmutableMap(Collection<Event> events) {
ImmutableMap.Builder<Event.Type, Event> builder = new ImmutableMap.Builder<>();
- for (Event event : events) {
- if (event.type() == Event.Type.requested) continue; // TODO (freva): Remove requested event after 8.70
+ for (Event event : events)
builder.put(event.type(), event);
- }
return builder.build();
}
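
With the filter removed, requested events are once again stored in the per-type event map. Since Guava's ImmutableMap.Builder rejects duplicate keys, this relies on the existing invariant that a History holds at most one event per type. A JDK-only sketch of the same keyed construction (Event modeled minimally; the real class carries more state):

import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

class HistorySketch {

    enum Type { requested, failed, reserved }

    record Event(Type type, String agent) { }

    /** Keys events by type. A duplicate type would silently overwrite here; Guava's builder would throw. */
    static Map<Type, Event> toMapByType(Collection<Event> events) {
        Map<Type, Event> byType = new LinkedHashMap<>();
        for (Event event : events)
            byType.put(event.type(), event);
        return Collections.unmodifiableMap(byType);
    }

    public static void main(String[] args) {
        Map<Type, Event> map = toMapByType(List.of(new Event(Type.requested, "NodeHealthTracker"),
                                                   new Event(Type.failed, "NodeFailer")));
        System.out.println(map.keySet()); // [requested, failed]
    }

}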
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index a5ebc6b3efc..b4e304155a6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -1,7 +1,6 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.persistence;
-import ai.vespa.http.DomainName;
import com.yahoo.component.Version;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.ApplicationId;
@@ -485,12 +484,15 @@ public class CuratorDatabaseClient {
transaction.onCommitted(() -> {
for (var lb : loadBalancers) {
if (lb.state() == fromState) continue;
- Optional<String> target = lb.instance().flatMap(instance -> instance.hostname().map(DomainName::value).or(instance::ipAddress));
if (fromState == null) {
- log.log(Level.INFO, () -> "Creating " + lb.id() + target.map(t -> " (" + t + ")").orElse("") +
+ log.log(Level.INFO, () -> "Creating " + lb.id() + lb.instance()
+ .map(instance -> " (" + instance.hostname() + ")")
+ .orElse("") +
" in " + lb.state());
} else {
- log.log(Level.INFO, () -> "Moving " + lb.id() + target.map(t -> " (" + t + ")").orElse("") +
+ log.log(Level.INFO, () -> "Moving " + lb.id() + lb.instance()
+ .map(instance -> " (" + instance.hostname() + ")")
+ .orElse("") +
" from " + fromState +
" to " + lb.state());
}
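
Both branches above use the Supplier overload of java.util.logging.Logger.log, so the message string is only built if INFO is actually enabled. A small self-contained demonstration of that overload:

import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;

class LazyLoggingDemo {

    private static final Logger log = Logger.getLogger(LazyLoggingDemo.class.getName());

    public static void main(String[] args) {
        Optional<String> hostname = Optional.of("lb1.example.com"); // hypothetical load balancer hostname
        // The lambda is evaluated only when INFO is enabled, so the concatenation
        // below costs nothing when the level is filtered out.
        log.log(Level.INFO, () -> "Creating lb-id" + hostname.map(h -> " (" + h + ")").orElse("") + " in reserved");
    }

}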
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java
index 024f071abb1..83a80847ec8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ContainerConfig.java
@@ -21,6 +21,7 @@ public class ContainerConfig {
" <component id='com.yahoo.vespa.hosted.provision.testutils.MockDeployer'/>\n" +
" <component id='com.yahoo.vespa.hosted.provision.testutils.MockInfraDeployer'/>\n" +
" <component id='com.yahoo.vespa.hosted.provision.testutils.MockProvisioner'/>\n" +
+ " <component id='com.yahoo.vespa.hosted.provision.testutils.TestHostLivenessTracker'/>\n" +
" <component id='com.yahoo.vespa.hosted.provision.testutils.ServiceMonitorStub'/>\n" +
" <component id='com.yahoo.vespa.hosted.provision.testutils.MockDuperModel'/>\n" +
" <component id='com.yahoo.vespa.hosted.provision.testutils.MockNodeFlavors'/>\n" +
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java
new file mode 100644
index 00000000000..28d0d5f89d7
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/TestHostLivenessTracker.java
@@ -0,0 +1,32 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.testutils;
+
+import com.yahoo.config.provision.HostLivenessTracker;
+
+import java.time.Clock;
+import java.time.Instant;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+
+/** A fully functional HostLivenessTracker implementation for use in tests */
+public class TestHostLivenessTracker implements HostLivenessTracker {
+
+ private final Clock clock;
+ private final Map<String, Instant> lastRequestFromHost = new HashMap<>();
+
+ public TestHostLivenessTracker(Clock clock) {
+ this.clock = clock;
+ }
+
+ @Override
+ public void receivedRequestFrom(String hostname) {
+ lastRequestFromHost.put(hostname, clock.instant());
+ }
+
+ @Override
+ public Optional<Instant> lastRequestFrom(String hostname) {
+ return Optional.ofNullable(lastRequestFromHost.get(hostname));
+ }
+
+}
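
A brief usage sketch of the tracker above, assuming it is on the classpath and using a fixed clock (the actual tests inject a manually advanceable clock):

import java.time.Clock;
import java.time.Instant;
import java.time.ZoneOffset;

class TestHostLivenessTrackerDemo {

    public static void main(String[] args) {
        Clock clock = Clock.fixed(Instant.parse("2022-10-13T19:10:41Z"), ZoneOffset.UTC);
        TestHostLivenessTracker tracker = new TestHostLivenessTracker(clock);

        tracker.receivedRequestFrom("host1.example.com");
        System.out.println(tracker.lastRequestFrom("host1.example.com")); // Optional[2022-10-13T19:10:41Z]
        System.out.println(tracker.lastRequestFrom("host2.example.com")); // Optional.empty
    }

}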
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 977d72c11ea..f67e9cd8345 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -27,6 +27,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
import com.yahoo.vespa.hosted.provision.testutils.ServiceMonitorStub;
+import com.yahoo.vespa.hosted.provision.testutils.TestHostLivenessTracker;
import java.time.Clock;
import java.time.Duration;
@@ -62,6 +63,7 @@ public class NodeFailTester {
public ServiceMonitorStub serviceMonitor;
public MockDeployer deployer;
public TestMetric metric;
+ private final TestHostLivenessTracker hostLivenessTracker;
private final NodeRepositoryProvisioner provisioner;
private final Curator curator;
@@ -72,6 +74,7 @@ public class NodeFailTester {
curator = tester.getCurator();
nodeRepository = tester.nodeRepository();
provisioner = tester.provisioner();
+ hostLivenessTracker = new TestHostLivenessTracker(clock);
}
private void initializeMaintainers(Map<ApplicationId, MockDeployer.ApplicationContext> apps) {
@@ -109,7 +112,7 @@ public class NodeFailTester {
/** Create hostCount hosts, one app with containerCount containers, and one app with contentCount content nodes. */
public static NodeFailTester withTwoApplications(int hostCount, int containerCount, int contentCount) {
NodeFailTester tester = new NodeFailTester();
- tester.tester.makeReadyHosts(hostCount, new NodeResources(2, 8, 20, 10));
+ tester.createHostNodes(hostCount);
// Create tenant host application
ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build();
@@ -136,7 +139,13 @@ public class NodeFailTester {
public static NodeFailTester withTwoApplications(int numberOfHosts) {
NodeFailTester tester = new NodeFailTester();
- tester.tester.makeReadyNodes(numberOfHosts, new NodeResources(4, 16, 400, 10), NodeType.host, 8);
+
+ int nodesPerHost = 3;
+ List<Node> hosts = tester.createHostNodes(numberOfHosts);
+ for (int i = 0; i < hosts.size(); i++) {
+ tester.createReadyNodes(nodesPerHost, i * nodesPerHost, Optional.of("parent" + (i + 1)),
+ new NodeResources(1, 4, 100, 0.3), NodeType.tenant);
+ }
// Create applications
ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build();
@@ -221,11 +230,26 @@ public class NodeFailTester {
}
public NodeHealthTracker createUpdater() {
- return new NodeHealthTracker(serviceMonitor, nodeRepository, Duration.ofMinutes(5), metric);
+ return new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, Duration.ofMinutes(5), metric);
+ }
+
+ public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) {
+ allNodesMakeAConfigRequestExcept(List.of(deadNodeArray));
+ }
+
+ public void allNodesMakeAConfigRequestExcept(List<Node> deadNodes) {
+ for (Node node : nodeRepository.nodes().list()) {
+ if ( ! deadNodes.contains(node))
+ hostLivenessTracker.receivedRequestFrom(node.hostname());
+ }
}
public Clock clock() { return clock; }
+ public List<Node> createReadyNodes(int count) {
+ return createReadyNodes(count, 0);
+ }
+
public List<Node> createReadyNodes(int count, NodeResources resources) {
return createReadyNodes(count, 0, resources);
}
@@ -234,10 +258,22 @@ public class NodeFailTester {
return createReadyNodes(count, 0, Optional.empty(), hostFlavors.getFlavorOrThrow("default"), nodeType);
}
+ public List<Node> createReadyNodes(int count, int startIndex) {
+ return createReadyNodes(count, startIndex, "default");
+ }
+
+ public List<Node> createReadyNodes(int count, int startIndex, String flavor) {
+ return createReadyNodes(count, startIndex, Optional.empty(), hostFlavors.getFlavorOrThrow(flavor), NodeType.tenant);
+ }
+
public List<Node> createReadyNodes(int count, int startIndex, NodeResources resources) {
return createReadyNodes(count, startIndex, Optional.empty(), new Flavor(resources), NodeType.tenant);
}
+ private List<Node> createReadyNodes(int count, int startIndex, Optional<String> parentHostname, NodeResources resources, NodeType nodeType) {
+ return createReadyNodes(count, startIndex, parentHostname, new Flavor(resources), nodeType);
+ }
+
private List<Node> createReadyNodes(int count, int startIndex, Optional<String> parentHostname, Flavor flavor, NodeType nodeType) {
List<Node> nodes = new ArrayList<>(count);
int lastOctetOfPoolAddress = 0;
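
The allNodesMakeAConfigRequestExcept helpers exist because, with liveness tracking back, tests must actively simulate config requests or every ready node would eventually look dead. A standalone sketch of the helper's shape (names hypothetical, reusing the test tracker above):

import java.util.List;

/** Sketch: mark every known host as live except an explicit dead set. */
class LivenessTestHelper {

    private final TestHostLivenessTracker tracker;
    private final List<String> allHostnames;

    LivenessTestHelper(TestHostLivenessTracker tracker, List<String> allHostnames) {
        this.tracker = tracker;
        this.allHostnames = allHostnames;
    }

    void allNodesMakeAConfigRequestExcept(List<String> deadHostnames) {
        for (String hostname : allHostnames)
            if ( ! deadHostnames.contains(hostname))
                tracker.receivedRequestFrom(hostname); // dead hosts stay silent
    }

}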
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index a21edf31cb8..3ba536ee4d7 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Report;
+import com.yahoo.vespa.hosted.provision.node.Reports;
import org.junit.Test;
import java.time.Duration;
@@ -20,7 +21,6 @@ import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
-import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -62,21 +62,23 @@ public class NodeFailerTest {
}
private void testNodeFailingWith(NodeFailTester tester, String hostWithHwFailure) {
- // The host should have 2 nodes in active
+ // The host should have 2 nodes in active and 1 ready
Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream()
.collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
assertEquals(2, hostnamesByState.get(Node.State.active).size());
+ assertEquals(1, hostnamesByState.get(Node.State.ready).size());
// Suspend the first of the active nodes
tester.suspend(hostnamesByState.get(Node.State.active).get(0));
tester.runMaintainers();
tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
// The first (and the only) ready node and the 1st active node that was allowed to fail should be failed
Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of(
- Node.State.failed, List.of(hostnamesByState.get(Node.State.active).get(0)),
+ Node.State.failed, List.of(hostnamesByState.get(Node.State.ready).get(0), hostnamesByState.get(Node.State.active).get(0)),
Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream()
.collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
@@ -86,6 +88,7 @@ public class NodeFailerTest {
tester.suspend(hostnamesByState.get(Node.State.active).get(1));
tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
// All of the children should be failed now
@@ -98,7 +101,7 @@ public class NodeFailerTest {
tester.suspend(hostWithHwFailure);
tester.runMaintainers();
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(hostWithHwFailure).get().state());
- assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(4, tester.nodeRepository.nodes().list(Node.State.failed).size());
}
@Test
@@ -107,12 +110,14 @@ public class NodeFailerTest {
String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state());
- // The host has 2 nodes in active
+ // The host has 2 nodes in active and 1 ready
Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.nodes().list().childrenOf(hostWithFailureReports).asList().stream()
.collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
assertEquals(2, hostnamesByState.get(Node.State.active).size());
String activeChild1 = hostnamesByState.get(Node.State.active).get(0);
String activeChild2 = hostnamesByState.get(Node.State.active).get(1);
+ assertEquals(1, hostnamesByState.get(Node.State.ready).size());
+ String readyChild = hostnamesByState.get(Node.State.ready).get(0);
// Set failure report to the parent and all its children.
Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low");
@@ -123,16 +128,20 @@ public class NodeFailerTest {
tester.nodeRepository.nodes().write(updatedNode, () -> {});
});
- // Neither the host nor the 2 active nodes are failed out because they have not been suspended
+ // The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
+ assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild1).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild2).get().state());
- // Suspending the host will not fail any more since none of the children are suspended
+ // Suspending the host will not fail any more since none of the children are suspended
tester.suspend(hostWithFailureReports);
tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
+ assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild1).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild2).get().state());
@@ -140,7 +149,9 @@ public class NodeFailerTest {
// Suspending one child node will fail that out.
tester.suspend(activeChild1);
tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
+ assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(activeChild1).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(activeChild2).get().state());
@@ -148,7 +159,9 @@ public class NodeFailerTest {
// Suspending the second child node will fail that out and the host.
tester.suspend(activeChild2);
tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
+ assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyChild).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(hostWithFailureReports).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(activeChild1).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(activeChild2).get().state());
@@ -197,18 +210,31 @@ public class NodeFailerTest {
@Test
public void node_failing() {
- NodeFailTester tester = NodeFailTester.withTwoApplications(6);
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
// For a day all nodes work so nothing happens
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
- assertEquals(0, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 0, tester.deployer.redeployments);
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
}
+ // Hardware failures are detected on two ready nodes, which are then failed
+ Node readyFail1 = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).asList().get(2);
+ Node readyFail2 = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).asList().get(3);
+ tester.nodeRepository.nodes().write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
+ tester.nodeRepository.nodes().write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
+ assertEquals(4, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
+ tester.runMaintainers();
+ assertEquals(2, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyFail1.hostname()).get().state());
+ assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(readyFail2.hostname()).get().state());
+
String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname();
tester.serviceMonitor.setHostDown(downHost1);
@@ -217,34 +243,41 @@ public class NodeFailerTest {
for (int minutes = 0; minutes < 45; minutes +=5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
- assertEquals(0, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ tester.allNodesMakeAConfigRequestExcept();
+ assertEquals( 0, tester.deployer.redeployments);
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals( 2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 2, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
}
tester.serviceMonitor.setHostUp(downHost1);
// downHost2 should now be failed and replaced, but not downHost1
tester.clock.advance(Duration.ofDays(1));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
- assertEquals(1, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 1, tester.deployer.redeployments);
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 1, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(downHost2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).asList().get(0).hostname());
// downHost1 fails again
tester.serviceMonitor.setHostDown(downHost1);
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
// the system goes down
tester.clock.advance(Duration.ofMinutes(120));
tester.failer = tester.createFailer();
tester.runMaintainers();
// the host is still down and fails
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
- assertEquals(2, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 2, tester.deployer.redeployments);
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
// the last host goes down
Node lastNode = tester.highestIndex(tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1));
@@ -253,19 +286,23 @@ public class NodeFailerTest {
for (int minutes = 0; minutes < 75; minutes +=5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
- assertEquals(2, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ tester.allNodesMakeAConfigRequestExcept();
+ assertEquals( 2, tester.deployer.redeployments);
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
}
// A new node is available
tester.createReadyNodes(1, 16, NodeFailTester.nodeResources);
tester.clock.advance(Duration.ofDays(1));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
// The node is now failed
- assertEquals(3, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 3, tester.deployer.redeployments);
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals( 5, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertTrue("The index of the last failed node is not reused",
tester.highestIndex(tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1)).allocation().get().membership().index()
>
@@ -282,10 +319,12 @@ public class NodeFailerTest {
String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
tester.serviceMonitor.setHostDown(downNode);
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
tester.clock.advance(Duration.ofMinutes(75));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
@@ -293,10 +332,12 @@ public class NodeFailerTest {
// Re-activate the node. It is still down, but should not be failed out until the grace period has passed again
tester.nodeRepository.nodes().reactivate(downNode, Agent.system, getClass().getSimpleName());
tester.clock.advance(Duration.ofMinutes(30));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
tester.clock.advance(Duration.ofMinutes(45));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
@@ -319,6 +360,7 @@ public class NodeFailerTest {
for (int minutes = 0; minutes < 45; minutes +=5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
assertEquals(0, tester.deployer.redeployments);
assertEquals(3, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
@@ -326,6 +368,7 @@ public class NodeFailerTest {
// downHost should now be failed and replaced
tester.clock.advance(Duration.ofDays(1));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(1, tester.deployer.redeployments);
assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
@@ -360,15 +403,37 @@ public class NodeFailerTest {
}
@Test
+ public void host_not_failed_without_config_requests() {
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
+
+ // For a day all nodes work so nothing happens
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host).size());
+ assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size());
+ }
+
+ tester.clock.advance(Duration.ofMinutes(180));
+ Node host = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host).first().get();
+ tester.allNodesMakeAConfigRequestExcept(host);
+ tester.runMaintainers();
+ assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host).size());
+ assertEquals( 0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size());
+ }
+
+ @Test
public void failing_hosts() {
NodeFailTester tester = NodeFailTester.withTwoApplications(7);
// For a day all nodes work so nothing happens
for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
- assertEquals(0, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
+ assertEquals(13, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
}
@@ -381,17 +446,21 @@ public class NodeFailerTest {
for (int minutes = 0; minutes < 45; minutes += 5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
assertEquals(0, tester.deployer.redeployments);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(13, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
}
tester.clock.advance(Duration.ofMinutes(30));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(2, tester.deployer.redeployments);
- assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size());
@@ -399,8 +468,9 @@ public class NodeFailerTest {
tester.runMaintainers();
assertEquals(2 + 1, tester.deployer.redeployments);
- assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(6, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size());
@@ -413,15 +483,18 @@ public class NodeFailerTest {
for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(interval));
- assertEquals(2 + 1, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ tester.allNodesMakeAConfigRequestExcept();
+ assertEquals(3 + 1, tester.nodeRepository.nodes().list(Node.State.failed).size());
}
tester.clock.advance(Duration.ofMinutes(30));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(3 + 1, tester.deployer.redeployments);
- assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals(4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(9, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(6, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
@@ -430,12 +503,14 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(downHost2);
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(90));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
tester.runMaintainers(); // The host is failed in the 2nd maintain()
assertEquals(5 + 2, tester.deployer.redeployments);
- assertEquals(5, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals(7, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(6, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
@@ -445,11 +520,13 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(downHost3);
tester.runMaintainers();
tester.clock.advance(Duration.ofDays(1));
+ tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
assertEquals(6 + 2, tester.deployer.redeployments);
- assertEquals(6, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals(9, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(4, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
}
@@ -470,6 +547,7 @@ public class NodeFailerTest {
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
assertEquals(count, tester.nodeRepository.nodes().list(Node.State.active).nodeType(nodeType).size());
}
@@ -482,6 +560,7 @@ public class NodeFailerTest {
for (int minutes = 0; minutes < 45; minutes +=5 ) {
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(5));
+ tester.allNodesMakeAConfigRequestExcept();
assertEquals( 0, tester.deployer.redeployments);
assertEquals(count, tester.nodeRepository.nodes().list(Node.State.active).nodeType(nodeType).size());
}
@@ -503,30 +582,38 @@ public class NodeFailerTest {
}
@Test
+ public void failing_divergent_ready_nodes() {
+ NodeFailTester tester = NodeFailTester.withNoApplications();
+
+ Node readyNode = tester.createReadyNodes(1).get(0);
+
+ tester.runMaintainers();
+ assertEquals(Node.State.ready, readyNode.state());
+
+ tester.nodeRepository.nodes().write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
+
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ }
+
+ @Test
public void node_failing_throttle() {
// Throttles based on an absolute number in small zone
{
- // 10 hosts with 7 container and 7 content nodes, total 24 nodes
- NodeFailTester tester = NodeFailTester.withTwoApplications(10, 7, 7);
-
- List<String> failedHostHostnames = tester.nodeRepository.nodes().list().stream()
- .flatMap(node -> node.parentHostname().stream())
- .collect(Collectors.groupingBy(h -> h, Collectors.counting()))
- .entrySet().stream()
- .sorted(Comparator.comparingLong((Map.Entry<String, Long> e) -> e.getValue()).reversed())
- .limit(3)
- .map(Map.Entry::getKey)
- .toList();
+ // 10 hosts with 3 tenant nodes each, total 40 nodes
+ NodeFailTester tester = NodeFailTester.withTwoApplications(10);
+ NodeList hosts = tester.nodeRepository.nodes().list().nodeType(NodeType.host);
// 3 hosts fail. 2 of them and all of their children are allowed to fail
- failedHostHostnames.forEach(hostname -> tester.serviceMonitor.setHostDown(hostname));
+ List<Node> failedHosts = hosts.asList().subList(0, 3);
+ failedHosts.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(61));
tester.runMaintainers();
tester.runMaintainers(); // hosts are typically failed in the 2nd maintain()
assertEquals(2 + /* hosts */
- (2 * 2) /* containers per host */,
+ (2 * 3) /* containers per host */,
tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric));
@@ -536,7 +623,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(interval));
}
tester.runMaintainers();
- assertEquals(6, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(8, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric));
@@ -544,14 +631,14 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(30));
tester.runMaintainers();
tester.runMaintainers(); // hosts are failed in the 2nd maintain()
- assertEquals(9, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// Nothing else to fail
tester.clock.advance(Duration.ofHours(25));
tester.runMaintainers();
- assertEquals(9, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}
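
The throttle assertions above exercise NodeFailer's policy of capping failures by an absolute count in small zones. This is not Vespa's actual policy, but the general shape is a sliding-window limiter: allow at most maxFailures within a window and skip (throttle) the rest. A minimal sketch:

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayDeque;
import java.util.Deque;

/** Illustrative failure throttle, deliberately simplified relative to NodeFailer's policy. */
class FailureThrottle {

    private final Clock clock;
    private final int maxFailures;
    private final Duration window;
    private final Deque<Instant> recentFailures = new ArrayDeque<>();

    FailureThrottle(Clock clock, int maxFailures, Duration window) {
        this.clock = clock;
        this.maxFailures = maxFailures;
        this.window = window;
    }

    /** Returns true if this failure should be skipped (throttled) */
    boolean throttle() {
        Instant cutoff = clock.instant().minus(window);
        while ( ! recentFailures.isEmpty() && recentFailures.peekFirst().isBefore(cutoff))
            recentFailures.removeFirst(); // forget failures outside the window
        if (recentFailures.size() >= maxFailures) return true;
        recentFailures.addLast(clock.instant());
        return false;
    }

}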
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
index faf3f60e8af..03ab18c15d9 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
@@ -74,8 +74,6 @@ import static org.junit.Assert.assertTrue;
*/
public class ProvisioningTester {
- public static final ApplicationId tenantHostApp = ApplicationId.from("hosted-vespa", "tenant-host", "default");
-
private final Curator curator;
private final NodeFlavors nodeFlavors;
private final ManualClock clock;
@@ -545,7 +543,7 @@ public class ProvisioningTester {
}
public void activateTenantHosts() {
- prepareAndActivateInfraApplication(tenantHostApp, NodeType.host);
+ prepareAndActivateInfraApplication(applicationId(), NodeType.host);
}
public static ClusterSpec containerClusterSpec() {
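
Dropping the shared static tenantHostApp in favor of a per-tester applicationId() removes cross-test coupling on a single well-known id. A counter-based sketch of such a generator (hypothetical; the real helper's naming scheme may differ):

import java.util.concurrent.atomic.AtomicInteger;

/** Sketch: mint a unique application id per call, for test isolation. */
class TestApplicationIds {

    private static final AtomicInteger counter = new AtomicInteger();

    static String applicationId() {
        int n = counter.incrementAndGet();
        return "tenant" + n + ".application" + n + ".default"; // tenant.application.instance form
    }

}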