summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorValerij Fredriksen <valerijf@verizonmedia.com>2019-09-19 16:21:54 +0200
committerValerij Fredriksen <valerijf@verizonmedia.com>2019-09-19 16:27:56 +0200
commitbdd780051258a53a15fb0c4b7b9d664e9972f48c (patch)
tree8acdcd0c95bf83c56f545234a381c33f9648d98c /node-repository
parentff8878cb658ead4175dcd53c524169e20c256364 (diff)
Remove hardwareFailure and hardwareDivergence from node-repo maintainers
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java28
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java41
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java3
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java30
5 files changed, 25 insertions, 80 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 0505dc754ff..c2f815b9d5d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -122,8 +122,7 @@ public class MetricsReporter extends Maintainer {
metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context);
metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context);
- metric.set("hardwareFailure", node.status().hardwareFailureDescription().isPresent() ? 1 : 0, context);
- metric.set("hardwareDivergence", node.status().hardwareDivergence().isPresent() ? 1 : 0, context);
+ metric.set("failReport", NodeFailer.reasonsToFailParentHost(node).isEmpty() ? 0 : 1, context);
orchestrator.apply(new HostName(node.hostname()))
.map(status -> status == HostStatus.ALLOWED_TO_BE_DOWN ? 1 : 0)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index ad147781841..67e0800a3f0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -164,10 +164,6 @@ public class NodeFailer extends Maintainer {
for (Node node : nodeRepository().getNodes(Node.State.ready)) {
if (expectConfigRequests(node) && ! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) {
nodesByFailureReason.put(node, "Not receiving config requests from node");
- } else if (node.status().hardwareFailureDescription().isPresent()) {
- nodesByFailureReason.put(node, "Node has hardware failure");
- } else if (node.status().hardwareDivergence().isPresent()) {
- nodesByFailureReason.put(node, "Node has hardware divergence");
} else {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
List<String> failureReports = reasonsToFailParentHost(hostNode);
@@ -211,18 +207,14 @@ public class NodeFailer extends Maintainer {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
} else if (hostSuspended(node, activeNodes)) {
- if (node.status().hardwareFailureDescription().isPresent()) {
- nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
- } else {
- Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
- if (hostNode.type().isDockerHost()) {
- List<String> failureReports = reasonsToFailParentHost(hostNode);
- if (failureReports.size() > 0) {
- if (hostNode.equals(node)) {
- nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
- } else {
- nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
- }
+ Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
+ if (hostNode.type().isDockerHost()) {
+ List<String> failureReports = reasonsToFailParentHost(hostNode);
+ if (failureReports.size() > 0) {
+ if (hostNode.equals(node)) {
+ nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
+ } else {
+ nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
}
}
}
@@ -241,10 +233,6 @@ public class NodeFailer extends Maintainer {
/** Returns whether node has any kind of hardware issue */
static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
- if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) {
- return true;
- }
-
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node);
return reasonsToFailParentHost(hostNode).size() > 0;
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 2b6bbdc6963..8509722b016 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -26,6 +26,8 @@ import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.Report;
+import com.yahoo.vespa.hosted.provision.node.Reports;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner;
import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
@@ -33,6 +35,7 @@ import com.yahoo.vespa.hosted.provision.testutils.MockProvisionServiceProvider;
import org.junit.Test;
import java.time.Duration;
+import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.Set;
@@ -155,30 +158,7 @@ public class FailedExpirerTest {
}
@Test
- public void ensure_failed_docker_nodes_are_deallocated() {
- FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
- .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
- .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
- .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
- .setReady("parent1", "parent2", "parent3")
- .allocate(tenantHostApplicationId, tenantHostApplicationClusterSpec, tenantHostApplicationCapacity)
- .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
- .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
- .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
- .setReady("node1", "node2", "node3")
- .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
- .failNode(4, "node1")
- .failWithHardwareFailure("node2", "node3");
-
- scenario.clock().advance(Duration.ofDays(5));
- scenario.expirer().run();
-
- scenario.assertNodesIn(Node.State.parked, "node2", "node3");
- scenario.assertNodesIn(Node.State.dirty, "node1");
- }
-
- @Test
- public void ensure_parked_docker_host() {
+ public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() {
FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
.withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
.withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
@@ -191,17 +171,16 @@ public class FailedExpirerTest {
.setReady("node1", "node2", "node3")
.allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
.failNode(8, "node3")
- .failWithHardwareFailure("node2", "node3")
.failWithHardwareFailure("parent2");
scenario.clock.advance(Duration.ofDays(5));
- scenario.expirer().run(); // Run twice because parent can only be parked after the child
scenario.expirer().run();
- scenario.assertNodesIn(Node.State.parked, "parent2", "node2", "node3");
+ scenario.assertNodesIn(Node.State.parked);
+ scenario.assertNodesIn(Node.State.failed, "parent2"); // Not parked because child (node2) isn't
}
@Test
- public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() {
+ public void ensure_parked_docker_host() {
FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
.withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
.withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
@@ -224,8 +203,6 @@ public class FailedExpirerTest {
.failWithHardwareFailure("parent3");
scenario.clock().advance(Duration.ofDays(3));
-
- scenario.expirer().run(); // Run twice because parent can only be parked after the child
scenario.expirer().run();
scenario.assertNodesIn(Node.State.failed, "parent3", "node3", "node4");
@@ -325,8 +302,8 @@ public class FailedExpirerTest {
public FailureScenario failWithHardwareFailure(String... hostname) {
Stream.of(hostname).forEach(h -> {
Node node = get(h);
- nodeRepository.write(node.with(node.status().withHardwareFailureDescription(
- Optional.of("memory_mcelog"))), () -> {});
+ Report report = Report.basicReport("reportId", Report.Type.HARD_FAIL, Instant.EPOCH, "hardware failure");
+ nodeRepository.write(node.with(new Reports().withReport(report)), () -> {});
nodeRepository.fail(h, Agent.system, "Failed by unit test");
});
return this;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index 7c30c905983..c5af068c30b 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -81,8 +81,7 @@ public class MetricsReporterTest {
expectedMetrics.put("wantToReboot", 0);
expectedMetrics.put("wantToRetire", 0);
expectedMetrics.put("wantToDeprovision", 0);
- expectedMetrics.put("hardwareFailure", 0);
- expectedMetrics.put("hardwareDivergence", 0);
+ expectedMetrics.put("failReport", 0);
expectedMetrics.put("allowedToBeDown", 0);
expectedMetrics.put("numberOfServices", 0L);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 1cf5ce252d7..e2fd8a8721c 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -8,6 +8,7 @@ import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Report;
+import com.yahoo.vespa.hosted.provision.node.Reports;
import org.junit.Test;
import java.time.Duration;
@@ -17,7 +18,6 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
-import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -37,22 +37,8 @@ import static org.mockito.Mockito.when;
*/
public class NodeFailerTest {
- @Test
- public void fail_nodes_with_hardware_failure_if_allowed_to_be_down() {
- NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
- String hostWithHwFailure = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
-
- // Set hardware failure to the parent and all its children
- tester.nodeRepository.getNodes().stream()
- .filter(node -> node.parentHostname().map(parent -> parent.equals(hostWithHwFailure))
- .orElse(node.hostname().equals(hostWithHwFailure)))
- .forEach(node -> {
- Node updatedNode = node.with(node.status().withHardwareFailureDescription(Optional.of("HW failure")));
- tester.nodeRepository.write(updatedNode, () -> {});
- });
-
- testNodeFailingWith(tester, hostWithHwFailure);
- }
+ private static final Report badTotalMemorySizeReport = Report.basicReport(
+ "badTotalMemorySize", HARD_FAIL, Instant.now(), "too low");
@Test
public void fail_nodes_with_severe_reports_if_allowed_to_be_down() {
@@ -60,7 +46,6 @@ public class NodeFailerTest {
String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
// Set failure report to the parent and all its children.
- Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low");
tester.nodeRepository.getNodes().stream()
.filter(node -> node.hostname().equals(hostWithFailureReports))
.forEach(node -> {
@@ -214,8 +199,8 @@ public class NodeFailerTest {
// Hardware failures are detected on two ready nodes, which are then failed
Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2);
Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3);
- tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog"))), () -> {});
- tester.nodeRepository.write(readyFail2.with(readyFail2.status().withHardwareFailureDescription(Optional.of("disk_smart"))), () -> {});
+ tester.nodeRepository.write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
+ tester.nodeRepository.write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
@@ -510,14 +495,11 @@ public class NodeFailerTest {
Node readyNode = tester.createReadyNodes(1).get(0);
tester.failer.run();
-
assertEquals(Node.State.ready, readyNode.state());
- tester.nodeRepository.write(readyNode.with(readyNode.status()
- .withHardwareDivergence(Optional.of("{\"specVerificationReport\":{\"actualIpv6Connection\":false}}"))), () -> {});
+ tester.nodeRepository.write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {});
tester.failer.run();
-
assertEquals(1, tester.nodeRepository.getNodes(Node.State.failed).size());
}