diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-02-28 14:02:48 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-02-28 14:02:48 +0100 |
commit | 460933f7f8a63f0997c9160ca319c4af60073cad (patch) | |
tree | 4e82440ea1771956a8d5c1a4221e3e82321ef8ca /node-repository | |
parent | 81fb002d463b2de96a10cf6cfd7a5f44c3088118 (diff) |
Use the type of the node report
Preserve the type of the node reports.
Use the type to make decisions in NodeFailer and FailedExpirer.
Diffstat (limited to 'node-repository')
3 files changed, 60 insertions, 34 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 8db1e854bbb..c5e727b23da 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -35,7 +35,6 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; -import java.util.stream.Stream; import static java.util.stream.Collectors.collectingAndThen; import static java.util.stream.Collectors.counting; @@ -170,7 +169,7 @@ public class NodeFailer extends Maintainer { nodesByFailureReason.put(node, "Node has hardware divergence"); } else { Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); - List<String> failureReports = reasonsToRetireActiveParentHost(hostNode); + List<String> failureReports = reasonsToFailParentHost(hostNode); if (failureReports.size() > 0) { if (hostNode.equals(node)) { nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); @@ -216,7 +215,7 @@ public class NodeFailer extends Maintainer { } else { Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); if (hostNode.type() == NodeType.host) { - List<String> failureReports = reasonsToRetireActiveParentHost(hostNode); + List<String> failureReports = reasonsToFailParentHost(hostNode); if (failureReports.size() > 0) { if (hostNode.equals(node)) { nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); @@ -231,26 +230,14 @@ public class NodeFailer extends Maintainer { return nodesByFailureReason; } - private static List<String> reasonsToRetireActiveParentHost(Node hostNode) { - return Stream.of( - "badMicrocode", - "badTotalMemorySize", - "badTotalDiskSize", - "badDiskType", - "badInterfaceSpeed", - "badCpuCount" - ) - .map(reportId -> baseReportToString(hostNode, reportId)) - .flatMap(Optional::stream) + private static List<String> reasonsToFailParentHost(Node hostNode) { + return hostNode.reports().getReports().stream() + .filter(report -> report.getType().hostShouldBeFailed()) + // The generated string is built from the report's ID, created time, and description only. + .map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription()) .collect(Collectors.toList()); } - /** The generated string is built from the report's ID, created time, and description only. */ - static Optional<String> baseReportToString(Node node, String reportId) { - return node.reports().getReport(reportId).map(report -> - reportId + " reported " + report.getCreatedTime() + ": " + report.getDescription()); - } - /** Returns whether node has any kind of hardware issue */ public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) { if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) { @@ -258,7 +245,7 @@ public class NodeFailer extends Maintainer { } Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node); - return reasonsToRetireActiveParentHost(hostNode).size() > 0; + return reasonsToFailParentHost(hostNode).size() > 0; } private boolean expectConfigRequests(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java index 1e4c4503b16..af32530a156 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java @@ -7,6 +7,7 @@ import com.yahoo.slime.Slime; import com.yahoo.vespa.config.SlimeUtils; import java.time.Instant; +import java.util.Arrays; /** * A {@code Report} contains information about a node, typically found and published by host admin. @@ -18,10 +19,13 @@ import java.time.Instant; public class Report { /** The time the report was created, in milliseconds since Epoch. */ public static final String CREATED_FIELD = "createdMillis"; - /** The description of the error (implies wanting to fail out node). */ + /** The type of the report. */ + public static final String TYPE_FIELD = "type"; + /** The description of the report. */ public static final String DESCRIPTION_FIELD = "description"; private final String reportId; + private final Type type; private final Instant createdTime; private final String description; @@ -29,17 +33,46 @@ public class Report { // clients of specific reports to query the details. private final Inspector reportInspector; - private Report(String reportId, Instant createdTime, String description, Inspector reportInspector) { + public enum Type { + /** The default type if none given, or not recognized. */ + UNSPECIFIED(false), + /** The host has a soft failure and should be parked for manual inspection. */ + SOFT_FAIL(true), + /** The host has a hard failure and should be given back to siteops. */ + HARD_FAIL(true); + + private final boolean badHost; + + /** + * @param badHost Whether the host is actively trying to suspend itself and all children in anticipation + * of being failed out (by the {@code NodeFailer}. Will expire from failed to parked. + */ + Type(boolean badHost) { + this.badHost = badHost; + } + + public boolean hostShouldBeFailed() { + return badHost; + } + + public boolean shouldExpireToParked() { + return badHost; + } + } + + private Report(String reportId, Type type, Instant createdTime, String description, Inspector reportInspector) { this.reportId = reportId; + this.type = type; this.createdTime = createdTime; this.description = description; this.reportInspector = reportInspector; } /** The ID of the report. */ - public String getReportId() { - return reportId; - } + public String getReportId() { return reportId; } + + /** The type of the report. */ + public Type getType() { return type; } /** The time the report was created. */ public Instant getCreatedTime() { return createdTime; } @@ -51,17 +84,21 @@ public class Report { public String getDescription() { return description; } /** For exploring the JSON (Slime) of the report. */ - public Inspector getInspector() { - return reportInspector; - } + public Inspector getInspector() { return reportInspector; } /** Create the simplest possible report. */ - public static Report basicReport(String reportId, Instant createdTime, String description) { - return new Report(reportId, createdTime, description, new Slime().setObject()); + public static Report basicReport(String reportId, Type type, Instant createdTime, String description) { + return new Report(reportId, type, createdTime, description, new Slime().setObject()); } /** The reportInspector will be used to serialize the full report later, including any createdTime and description. */ public static Report fromSlime(String reportId, Inspector reportInspector) { + String typeString = reportInspector.field(TYPE_FIELD).asString(); + Type type = Arrays.stream(Type.values()) + .filter(t -> t.name().equalsIgnoreCase(typeString)) + .findFirst() + .orElse(Type.UNSPECIFIED); + long millisSinceEpoch = reportInspector.field(CREATED_FIELD).asLong(); if (millisSinceEpoch <= 0) { // Including null or not set. @@ -71,7 +108,7 @@ public class Report { String description = reportInspector.field(DESCRIPTION_FIELD).asString(); - return new Report(reportId, createdTime, description, reportInspector); + return new Report(reportId, type, createdTime, description, reportInspector); } public void toSlime(Cursor reportCursor) { @@ -80,6 +117,7 @@ public class Report { // In Slime, trying to overwrite an already existing field is a no-op. // We'll write the required fields now. If they weren't already set by the above copyObject, // in particular the created field, the below will be set it to the current timestamp which is what we want. + if (type != Type.UNSPECIFIED) reportCursor.setString(TYPE_FIELD, type.name()); reportCursor.setLong(CREATED_FIELD, createdTime.toEpochMilli()); if (!description.isEmpty()) reportCursor.setString(DESCRIPTION_FIELD, description); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 4aa9fb2b11d..965ce990c64 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -25,6 +25,7 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; +import static com.yahoo.vespa.hosted.provision.node.Report.Type.HARD_FAIL; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -62,7 +63,7 @@ public class NodeFailerTest { String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); // Set failure report to the parent and all its children. - Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low"); + Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low"); tester.nodeRepository.getNodes().stream() .filter(node -> node.hostname().equals(hostWithFailureReports)) .forEach(node -> { @@ -133,7 +134,7 @@ public class NodeFailerTest { String readyChild = hostnamesByState.get(Node.State.ready).get(0); // Set failure report to the parent and all its children. - Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low"); + Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low"); tester.nodeRepository.getNodes().stream() .filter(node -> node.hostname().equals(hostWithFailureReports)) .forEach(node -> { |