diff options
author | Valerij Fredriksen <valerijf@verizonmedia.com> | 2019-09-19 16:21:54 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerijf@verizonmedia.com> | 2019-09-19 16:27:56 +0200 |
commit | bdd780051258a53a15fb0c4b7b9d664e9972f48c (patch) | |
tree | 8acdcd0c95bf83c56f545234a381c33f9648d98c /node-repository | |
parent | ff8878cb658ead4175dcd53c524169e20c256364 (diff) |
Remove hardwareFailure and hardwareDivergence from node-repo maintainers
Diffstat (limited to 'node-repository')
5 files changed, 25 insertions, 80 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 0505dc754ff..c2f815b9d5d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -122,8 +122,7 @@ public class MetricsReporter extends Maintainer { metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context); metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context); - metric.set("hardwareFailure", node.status().hardwareFailureDescription().isPresent() ? 1 : 0, context); - metric.set("hardwareDivergence", node.status().hardwareDivergence().isPresent() ? 1 : 0, context); + metric.set("failReport", NodeFailer.reasonsToFailParentHost(node).isEmpty() ? 0 : 1, context); orchestrator.apply(new HostName(node.hostname())) .map(status -> status == HostStatus.ALLOWED_TO_BE_DOWN ? 1 : 0) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index ad147781841..67e0800a3f0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -164,10 +164,6 @@ public class NodeFailer extends Maintainer { for (Node node : nodeRepository().getNodes(Node.State.ready)) { if (expectConfigRequests(node) && ! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) { nodesByFailureReason.put(node, "Not receiving config requests from node"); - } else if (node.status().hardwareFailureDescription().isPresent()) { - nodesByFailureReason.put(node, "Node has hardware failure"); - } else if (node.status().hardwareDivergence().isPresent()) { - nodesByFailureReason.put(node, "Node has hardware divergence"); } else { Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); List<String> failureReports = reasonsToFailParentHost(hostNode); @@ -211,18 +207,14 @@ public class NodeFailer extends Maintainer { if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) { nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); } else if (hostSuspended(node, activeNodes)) { - if (node.status().hardwareFailureDescription().isPresent()) { - nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get()); - } else { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); - if (hostNode.type().isDockerHost()) { - List<String> failureReports = reasonsToFailParentHost(hostNode); - if (failureReports.size() > 0) { - if (hostNode.equals(node)) { - nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); - } else { - nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); - } + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); + if (hostNode.type().isDockerHost()) { + List<String> failureReports = reasonsToFailParentHost(hostNode); + if (failureReports.size() > 0) { + if (hostNode.equals(node)) { + nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); + } else { + nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); } } } @@ -241,10 +233,6 @@ public class NodeFailer extends Maintainer { /** Returns whether node has any kind of hardware issue */ static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) { - if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) { - return true; - } - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node); return reasonsToFailParentHost(hostNode).size() > 0; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index 2b6bbdc6963..8509722b016 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -26,6 +26,8 @@ import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.Report; +import com.yahoo.vespa.hosted.provision.node.Reports; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; @@ -33,6 +35,7 @@ import com.yahoo.vespa.hosted.provision.testutils.MockProvisionServiceProvider; import org.junit.Test; import java.time.Duration; +import java.time.Instant; import java.util.List; import java.util.Optional; import java.util.Set; @@ -155,30 +158,7 @@ public class FailedExpirerTest { } @Test - public void ensure_failed_docker_nodes_are_deallocated() { - FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod) - .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1") - .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2") - .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3") - .setReady("parent1", "parent2", "parent3") - .allocate(tenantHostApplicationId, tenantHostApplicationClusterSpec, tenantHostApplicationCapacity) - .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1") - .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2") - .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3") - .setReady("node1", "node2", "node3") - .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3") - .failNode(4, "node1") - .failWithHardwareFailure("node2", "node3"); - - scenario.clock().advance(Duration.ofDays(5)); - scenario.expirer().run(); - - scenario.assertNodesIn(Node.State.parked, "node2", "node3"); - scenario.assertNodesIn(Node.State.dirty, "node1"); - } - - @Test - public void ensure_parked_docker_host() { + public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() { FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod) .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1") .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2") @@ -191,17 +171,16 @@ public class FailedExpirerTest { .setReady("node1", "node2", "node3") .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3") .failNode(8, "node3") - .failWithHardwareFailure("node2", "node3") .failWithHardwareFailure("parent2"); scenario.clock.advance(Duration.ofDays(5)); - scenario.expirer().run(); // Run twice because parent can only be parked after the child scenario.expirer().run(); - scenario.assertNodesIn(Node.State.parked, "parent2", "node2", "node3"); + scenario.assertNodesIn(Node.State.parked); + scenario.assertNodesIn(Node.State.failed, "parent2"); // Not parked because child (node2) isn't } @Test - public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() { + public void ensure_parked_docker_host() { FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod) .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1") .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2") @@ -224,8 +203,6 @@ public class FailedExpirerTest { .failWithHardwareFailure("parent3"); scenario.clock().advance(Duration.ofDays(3)); - - scenario.expirer().run(); // Run twice because parent can only be parked after the child scenario.expirer().run(); scenario.assertNodesIn(Node.State.failed, "parent3", "node3", "node4"); @@ -325,8 +302,8 @@ public class FailedExpirerTest { public FailureScenario failWithHardwareFailure(String... hostname) { Stream.of(hostname).forEach(h -> { Node node = get(h); - nodeRepository.write(node.with(node.status().withHardwareFailureDescription( - Optional.of("memory_mcelog"))), () -> {}); + Report report = Report.basicReport("reportId", Report.Type.HARD_FAIL, Instant.EPOCH, "hardware failure"); + nodeRepository.write(node.with(new Reports().withReport(report)), () -> {}); nodeRepository.fail(h, Agent.system, "Failed by unit test"); }); return this; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index 7c30c905983..c5af068c30b 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -81,8 +81,7 @@ public class MetricsReporterTest { expectedMetrics.put("wantToReboot", 0); expectedMetrics.put("wantToRetire", 0); expectedMetrics.put("wantToDeprovision", 0); - expectedMetrics.put("hardwareFailure", 0); - expectedMetrics.put("hardwareDivergence", 0); + expectedMetrics.put("failReport", 0); expectedMetrics.put("allowedToBeDown", 0); expectedMetrics.put("numberOfServices", 0L); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 1cf5ce252d7..e2fd8a8721c 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -8,6 +8,7 @@ import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Report; +import com.yahoo.vespa.hosted.provision.node.Reports; import org.junit.Test; import java.time.Duration; @@ -17,7 +18,6 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -37,22 +37,8 @@ import static org.mockito.Mockito.when; */ public class NodeFailerTest { - @Test - public void fail_nodes_with_hardware_failure_if_allowed_to_be_down() { - NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6); - String hostWithHwFailure = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); - - // Set hardware failure to the parent and all its children - tester.nodeRepository.getNodes().stream() - .filter(node -> node.parentHostname().map(parent -> parent.equals(hostWithHwFailure)) - .orElse(node.hostname().equals(hostWithHwFailure))) - .forEach(node -> { - Node updatedNode = node.with(node.status().withHardwareFailureDescription(Optional.of("HW failure"))); - tester.nodeRepository.write(updatedNode, () -> {}); - }); - - testNodeFailingWith(tester, hostWithHwFailure); - } + private static final Report badTotalMemorySizeReport = Report.basicReport( + "badTotalMemorySize", HARD_FAIL, Instant.now(), "too low"); @Test public void fail_nodes_with_severe_reports_if_allowed_to_be_down() { @@ -60,7 +46,6 @@ public class NodeFailerTest { String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); // Set failure report to the parent and all its children. - Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low"); tester.nodeRepository.getNodes().stream() .filter(node -> node.hostname().equals(hostWithFailureReports)) .forEach(node -> { @@ -214,8 +199,8 @@ public class NodeFailerTest { // Hardware failures are detected on two ready nodes, which are then failed Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2); Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3); - tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog"))), () -> {}); - tester.nodeRepository.write(readyFail2.with(readyFail2.status().withHardwareFailureDescription(Optional.of("disk_smart"))), () -> {}); + tester.nodeRepository.write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); + tester.nodeRepository.write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); @@ -510,14 +495,11 @@ public class NodeFailerTest { Node readyNode = tester.createReadyNodes(1).get(0); tester.failer.run(); - assertEquals(Node.State.ready, readyNode.state()); - tester.nodeRepository.write(readyNode.with(readyNode.status() - .withHardwareDivergence(Optional.of("{\"specVerificationReport\":{\"actualIpv6Connection\":false}}"))), () -> {}); + tester.nodeRepository.write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); tester.failer.run(); - assertEquals(1, tester.nodeRepository.getNodes(Node.State.failed).size()); } |