diff options
author | Martin Polden <mpolden@mpolden.no> | 2017-10-19 22:20:41 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-19 22:20:41 +0200 |
commit | 70109980a8c3aebdd317d76e203c028e69365660 (patch) | |
tree | ad0e3369dff7f38bbdb62706d37ad2cf32a6e7b2 /node-repository/src/main/java | |
parent | b2bca50263aa1a7dbd3c30b932296507bce0db7a (diff) | |
parent | 1295cb17b5f3269ac4ab6e6469fe79f751556208 (diff) |
Merge pull request #3807 from vespa-engine/andreer/fail-divergent-ready-nodes
fail divergent ready nodes
Diffstat (limited to 'node-repository/src/main/java')
4 files changed, 26 insertions, 5 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index b7971e61117..1e0202d4735 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -57,7 +57,7 @@ public class FailedExpirer extends Expirer { protected void expire(List<Node> expired) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node recycleCandidate : expired) { - if (recycleCandidate.status().hardwareFailureDescription().isPresent()) { + if (recycleCandidate.status().hardwareFailureDescription().isPresent() || recycleCandidate.status().hardwareDivergence().isPresent()) { List<String> nonParkedChildren = recycleCandidate.type() != NodeType.host ? Collections.emptyList() : nodeRepository.getChildNodes(recycleCandidate.hostname()).stream() .filter(node -> node.state() != Node.State.parked) @@ -65,9 +65,9 @@ public class FailedExpirer extends Expirer { .collect(Collectors.toList()); if (nonParkedChildren.isEmpty()) { - nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure on node"); + nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure/divergence on node"); } else { - log.info(String.format("Expired failed node %s with HW fail is not parked because some of its children" + + log.info(String.format("Expired failed node %s with HW failure/divergence is not parked because some of its children" + " (%s) are not yet parked", recycleCandidate.hostname(), String.join(", ", nonParkedChildren))); } } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 63bd8f1b424..d90b558a6eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -88,6 +88,10 @@ public class NodeFailer extends Maintainer { if ( ! throttle(node)) nodeRepository().fail(node.hostname(), Agent.system, "Node has hardware failure"); + for (Node node: readyNodesWithHardwareDivergence()) + if ( ! throttle(node)) nodeRepository().fail(node.hostname(), + Agent.system, "Node hardware diverges from spec"); + // Active nodes for (Node node : determineActiveNodeDownStatus()) { Instant graceTimeEnd = node.history().event(History.Event.Type.down).get().at().plus(downTimeLimit); @@ -130,6 +134,12 @@ public class NodeFailer extends Maintainer { .collect(Collectors.toList()); } + private List<Node> readyNodesWithHardwareDivergence() { + return nodeRepository().getNodes(Node.State.ready).stream() + .filter(node -> node.status().hardwareDivergence().isPresent()) + .collect(Collectors.toList()); + } + private boolean wasMadeReadyBefore(Instant instant, Node node) { Optional<History.Event> readiedEvent = node.history().event(History.Event.Type.readied); if ( ! readiedEvent.isPresent()) return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 9393dc5ead4..dc26303d804 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -171,7 +171,7 @@ public class NodeSerializer { hardwareFailureDescriptionFromSlime(object), object.field(wantToRetireKey).asBool(), wantToDeprovision, - hardwareDivergenceFromSlime(object)); + removeQuotedNulls(hardwareDivergenceFromSlime(object))); } private Flavor flavorFromSlime(Inspector object) { @@ -239,6 +239,12 @@ public class NodeSerializer { return Optional.empty(); } + // Remove when we no longer have "null" strings for this field in the node repo + private Optional<String> removeQuotedNulls(Optional<String> value) { + return value.filter(v -> !v.equals("null")); + } + + private Set<String> ipAddressesFromSlime(Inspector object, String key) { ImmutableSet.Builder<String> ipAddresses = ImmutableSet.builder(); object.field(key).traverse((ArrayTraverser) (i, item) -> ipAddresses.add(item.asString())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java index d146e976121..62bfea21eff 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java @@ -125,7 +125,7 @@ public class NodePatcher { case "wantToDeprovision" : return node.with(node.status().withWantToDeprovision(asBoolean(value))); case "hardwareDivergence" : - return node.with(node.status().withHardwareDivergence(asOptionalString(value))); + return node.with(node.status().withHardwareDivergence(removeQuotedNulls(asOptionalString(value)))); default : throw new IllegalArgumentException("Could not apply field '" + name + "' on a node: No such modifiable field"); } @@ -170,6 +170,11 @@ public class NodePatcher { return field.type().equals(Type.NIX) ? Optional.empty() : Optional.of(asString(field)); } + // Remove when we no longer have "null" strings for this field in the node repo + private Optional<String> removeQuotedNulls(Optional<String> value) { + return value.filter(v -> !v.equals("null")); + } + private boolean asBoolean(Inspector field) { if ( ! field.type().equals(Type.BOOL)) throw new IllegalArgumentException("Expected a BOOL value, got a " + field.type()); |