diff options
author | Martin Polden <mpolden@mpolden.no> | 2017-10-19 22:20:41 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-19 22:20:41 +0200 |
commit | 70109980a8c3aebdd317d76e203c028e69365660 (patch) | |
tree | ad0e3369dff7f38bbdb62706d37ad2cf32a6e7b2 /node-repository | |
parent | b2bca50263aa1a7dbd3c30b932296507bce0db7a (diff) | |
parent | 1295cb17b5f3269ac4ab6e6469fe79f751556208 (diff) |
Merge pull request #3807 from vespa-engine/andreer/fail-divergent-ready-nodes
fail divergent ready nodes
Diffstat (limited to 'node-repository')
6 files changed, 51 insertions, 5 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index b7971e61117..1e0202d4735 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -57,7 +57,7 @@ public class FailedExpirer extends Expirer { protected void expire(List<Node> expired) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node recycleCandidate : expired) { - if (recycleCandidate.status().hardwareFailureDescription().isPresent()) { + if (recycleCandidate.status().hardwareFailureDescription().isPresent() || recycleCandidate.status().hardwareDivergence().isPresent()) { List<String> nonParkedChildren = recycleCandidate.type() != NodeType.host ? Collections.emptyList() : nodeRepository.getChildNodes(recycleCandidate.hostname()).stream() .filter(node -> node.state() != Node.State.parked) @@ -65,9 +65,9 @@ public class FailedExpirer extends Expirer { .collect(Collectors.toList()); if (nonParkedChildren.isEmpty()) { - nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure on node"); + nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure/divergence on node"); } else { - log.info(String.format("Expired failed node %s with HW fail is not parked because some of its children" + + log.info(String.format("Expired failed node %s with HW failure/divergence is not parked because some of its children" + " (%s) are not yet parked", recycleCandidate.hostname(), String.join(", ", nonParkedChildren))); } } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 63bd8f1b424..d90b558a6eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -88,6 +88,10 @@ public class NodeFailer extends Maintainer { if ( ! throttle(node)) nodeRepository().fail(node.hostname(), Agent.system, "Node has hardware failure"); + for (Node node: readyNodesWithHardwareDivergence()) + if ( ! throttle(node)) nodeRepository().fail(node.hostname(), + Agent.system, "Node hardware diverges from spec"); + // Active nodes for (Node node : determineActiveNodeDownStatus()) { Instant graceTimeEnd = node.history().event(History.Event.Type.down).get().at().plus(downTimeLimit); @@ -130,6 +134,12 @@ public class NodeFailer extends Maintainer { .collect(Collectors.toList()); } + private List<Node> readyNodesWithHardwareDivergence() { + return nodeRepository().getNodes(Node.State.ready).stream() + .filter(node -> node.status().hardwareDivergence().isPresent()) + .collect(Collectors.toList()); + } + private boolean wasMadeReadyBefore(Instant instant, Node node) { Optional<History.Event> readiedEvent = node.history().event(History.Event.Type.readied); if ( ! readiedEvent.isPresent()) return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 9393dc5ead4..dc26303d804 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -171,7 +171,7 @@ public class NodeSerializer { hardwareFailureDescriptionFromSlime(object), object.field(wantToRetireKey).asBool(), wantToDeprovision, - hardwareDivergenceFromSlime(object)); + removeQuotedNulls(hardwareDivergenceFromSlime(object))); } private Flavor flavorFromSlime(Inspector object) { @@ -239,6 +239,12 @@ public class NodeSerializer { return Optional.empty(); } + // Remove when we no longer have "null" strings for this field in the node repo + private Optional<String> removeQuotedNulls(Optional<String> value) { + return value.filter(v -> !v.equals("null")); + } + + private Set<String> ipAddressesFromSlime(Inspector object, String key) { ImmutableSet.Builder<String> ipAddresses = ImmutableSet.builder(); object.field(key).traverse((ArrayTraverser) (i, item) -> ipAddresses.add(item.asString())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java index d146e976121..62bfea21eff 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java @@ -125,7 +125,7 @@ public class NodePatcher { case "wantToDeprovision" : return node.with(node.status().withWantToDeprovision(asBoolean(value))); case "hardwareDivergence" : - return node.with(node.status().withHardwareDivergence(asOptionalString(value))); + return node.with(node.status().withHardwareDivergence(removeQuotedNulls(asOptionalString(value)))); default : throw new IllegalArgumentException("Could not apply field '" + name + "' on a node: No such modifiable field"); } @@ -170,6 +170,11 @@ public class NodePatcher { return field.type().equals(Type.NIX) ? Optional.empty() : Optional.of(asString(field)); } + // Remove when we no longer have "null" strings for this field in the node repo + private Optional<String> removeQuotedNulls(Optional<String> value) { + return value.filter(v -> !v.equals("null")); + } + private boolean asBoolean(Inspector field) { if ( ! field.type().equals(Type.BOOL)) throw new IllegalArgumentException("Expected a BOOL value, got a " + field.type()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 8fd67f949d9..c78663415ef 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -353,6 +353,24 @@ public class NodeFailerTest { } @Test + public void failing_divergent_ready_nodes() { + NodeFailTester tester = NodeFailTester.withNoApplications(); + + Node readyNode = tester.createReadyNodes(1).get(0); + + tester.failer.run(); + + assertEquals(Node.State.ready, readyNode.state()); + + tester.nodeRepository.write(readyNode.with(readyNode.status() + .withHardwareDivergence(Optional.of("{\"specVerificationReport\":{\"actualIpv6Connection\":false}}")))); + + tester.failer.run(); + + assertEquals(1, tester.nodeRepository.getNodes(Node.State.failed).size()); + } + + @Test public void node_failing_throttle() { // Throttles based on a absolute number in small zone { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java index 5f81013d8e1..f42ba220fdf 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java @@ -496,6 +496,13 @@ public class RestApiTest { Request.Method.PATCH), "{\"message\":\"Updated host6.yahoo.com\"}"); assertFile(new Request("http://localhost:8080/nodes/v2/node/host6.yahoo.com"), "node6.json"); + + // Clear on quoted "null" report + assertResponse(new Request("http://localhost:8080/nodes/v2/node/host6.yahoo.com", + Utf8.toBytes("{\"hardwareDivergence\": \"null\"}"), + Request.Method.PATCH), + "{\"message\":\"Updated host6.yahoo.com\"}"); + assertFile(new Request("http://localhost:8080/nodes/v2/node/host6.yahoo.com"), "node6.json"); } /** Tests the rendering of each node separately to make it easier to find errors */ |