aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
author: Håkon Hallingstad <hakon@verizonmedia.com> 2019-02-28 14:02:48 +0100
committer: Håkon Hallingstad <hakon@verizonmedia.com> 2019-02-28 14:02:48 +0100
commit: 460933f7f8a63f0997c9160ca319c4af60073cad (patch)
tree: 4e82440ea1771956a8d5c1a4221e3e82321ef8ca /node-repository
parent: 81fb002d463b2de96a10cf6cfd7a5f44c3088118 (diff)
Use the type of the node report
Preserve the type of the node reports. Use the type to make decisions in NodeFailer and FailedExpirer.
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java29
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java60
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java5
3 files changed, 60 insertions, 34 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 8db1e854bbb..c5e727b23da 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -35,7 +35,6 @@ import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
import static java.util.stream.Collectors.collectingAndThen;
import static java.util.stream.Collectors.counting;
@@ -170,7 +169,7 @@ public class NodeFailer extends Maintainer {
nodesByFailureReason.put(node, "Node has hardware divergence");
} else {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
- List<String> failureReports = reasonsToRetireActiveParentHost(hostNode);
+ List<String> failureReports = reasonsToFailParentHost(hostNode);
if (failureReports.size() > 0) {
if (hostNode.equals(node)) {
nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
@@ -216,7 +215,7 @@ public class NodeFailer extends Maintainer {
} else {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
if (hostNode.type() == NodeType.host) {
- List<String> failureReports = reasonsToRetireActiveParentHost(hostNode);
+ List<String> failureReports = reasonsToFailParentHost(hostNode);
if (failureReports.size() > 0) {
if (hostNode.equals(node)) {
nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
@@ -231,26 +230,14 @@ public class NodeFailer extends Maintainer {
return nodesByFailureReason;
}
- private static List<String> reasonsToRetireActiveParentHost(Node hostNode) {
- return Stream.of(
- "badMicrocode",
- "badTotalMemorySize",
- "badTotalDiskSize",
- "badDiskType",
- "badInterfaceSpeed",
- "badCpuCount"
- )
- .map(reportId -> baseReportToString(hostNode, reportId))
- .flatMap(Optional::stream)
+ private static List<String> reasonsToFailParentHost(Node hostNode) {
+ return hostNode.reports().getReports().stream()
+ .filter(report -> report.getType().hostShouldBeFailed())
+ // The generated string is built from the report's ID, created time, and description only.
+ .map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription())
.collect(Collectors.toList());
}
- /** The generated string is built from the report's ID, created time, and description only. */
- static Optional<String> baseReportToString(Node node, String reportId) {
- return node.reports().getReport(reportId).map(report ->
- reportId + " reported " + report.getCreatedTime() + ": " + report.getDescription());
- }
-
/** Returns whether node has any kind of hardware issue */
public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) {
@@ -258,7 +245,7 @@ public class NodeFailer extends Maintainer {
}
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node);
- return reasonsToRetireActiveParentHost(hostNode).size() > 0;
+ return reasonsToFailParentHost(hostNode).size() > 0;
}
private boolean expectConfigRequests(Node node) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java
index 1e4c4503b16..af32530a156 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java
@@ -7,6 +7,7 @@ import com.yahoo.slime.Slime;
import com.yahoo.vespa.config.SlimeUtils;
import java.time.Instant;
+import java.util.Arrays;
/**
* A {@code Report} contains information about a node, typically found and published by host admin.
@@ -18,10 +19,13 @@ import java.time.Instant;
public class Report {
/** The time the report was created, in milliseconds since Epoch. */
public static final String CREATED_FIELD = "createdMillis";
- /** The description of the error (implies wanting to fail out node). */
+ /** The type of the report. */
+ public static final String TYPE_FIELD = "type";
+ /** The description of the report. */
public static final String DESCRIPTION_FIELD = "description";
private final String reportId;
+ private final Type type;
private final Instant createdTime;
private final String description;
@@ -29,17 +33,46 @@ public class Report {
// clients of specific reports to query the details.
private final Inspector reportInspector;
- private Report(String reportId, Instant createdTime, String description, Inspector reportInspector) {
+ public enum Type {
+ /** The default type if none given, or not recognized. */
+ UNSPECIFIED(false),
+ /** The host has a soft failure and should be parked for manual inspection. */
+ SOFT_FAIL(true),
+ /** The host has a hard failure and should be given back to siteops. */
+ HARD_FAIL(true);
+
+ private final boolean badHost;
+
+ /**
+ * @param badHost Whether the host is actively trying to suspend itself and all children in anticipation
+ * of being failed out (by the {@code NodeFailer}). Will expire from failed to parked.
+ */
+ Type(boolean badHost) {
+ this.badHost = badHost;
+ }
+
+ public boolean hostShouldBeFailed() {
+ return badHost;
+ }
+
+ public boolean shouldExpireToParked() {
+ return badHost;
+ }
+ }
+
+ private Report(String reportId, Type type, Instant createdTime, String description, Inspector reportInspector) {
this.reportId = reportId;
+ this.type = type;
this.createdTime = createdTime;
this.description = description;
this.reportInspector = reportInspector;
}
/** The ID of the report. */
- public String getReportId() {
- return reportId;
- }
+ public String getReportId() { return reportId; }
+
+ /** The type of the report. */
+ public Type getType() { return type; }
/** The time the report was created. */
public Instant getCreatedTime() { return createdTime; }
@@ -51,17 +84,21 @@ public class Report {
public String getDescription() { return description; }
/** For exploring the JSON (Slime) of the report. */
- public Inspector getInspector() {
- return reportInspector;
- }
+ public Inspector getInspector() { return reportInspector; }
/** Create the simplest possible report. */
- public static Report basicReport(String reportId, Instant createdTime, String description) {
- return new Report(reportId, createdTime, description, new Slime().setObject());
+ public static Report basicReport(String reportId, Type type, Instant createdTime, String description) {
+ return new Report(reportId, type, createdTime, description, new Slime().setObject());
}
/** The reportInspector will be used to serialize the full report later, including any createdTime and description. */
public static Report fromSlime(String reportId, Inspector reportInspector) {
+ String typeString = reportInspector.field(TYPE_FIELD).asString();
+ Type type = Arrays.stream(Type.values())
+ .filter(t -> t.name().equalsIgnoreCase(typeString))
+ .findFirst()
+ .orElse(Type.UNSPECIFIED);
+
long millisSinceEpoch = reportInspector.field(CREATED_FIELD).asLong();
if (millisSinceEpoch <= 0) {
// Including null or not set.
@@ -71,7 +108,7 @@ public class Report {
String description = reportInspector.field(DESCRIPTION_FIELD).asString();
- return new Report(reportId, createdTime, description, reportInspector);
+ return new Report(reportId, type, createdTime, description, reportInspector);
}
public void toSlime(Cursor reportCursor) {
@@ -80,6 +117,7 @@ public class Report {
// In Slime, trying to overwrite an already existing field is a no-op.
// We'll write the required fields now. If they weren't already set by the above copyObject,
// in particular the created field, the below will set it to the current timestamp, which is what we want.
+ if (type != Type.UNSPECIFIED) reportCursor.setString(TYPE_FIELD, type.name());
reportCursor.setLong(CREATED_FIELD, createdTime.toEpochMilli());
if (!description.isEmpty()) reportCursor.setString(DESCRIPTION_FIELD, description);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 4aa9fb2b11d..965ce990c64 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -25,6 +25,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import static com.yahoo.vespa.hosted.provision.node.Report.Type.HARD_FAIL;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -62,7 +63,7 @@ public class NodeFailerTest {
String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
// Set failure report to the parent and all its children.
- Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
+ Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low");
tester.nodeRepository.getNodes().stream()
.filter(node -> node.hostname().equals(hostWithFailureReports))
.forEach(node -> {
@@ -133,7 +134,7 @@ public class NodeFailerTest {
String readyChild = hostnamesByState.get(Node.State.ready).get(0);
// Set failure report to the parent and all its children.
- Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
+ Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low");
tester.nodeRepository.getNodes().stream()
.filter(node -> node.hostname().equals(hostWithFailureReports))
.forEach(node -> {