about summary refs log tree commit diff stats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2022-04-13 15:32:33 +0200
committer Martin Polden <mpolden@mpolden.no> 2022-04-19 09:20:39 +0200
commit 71bb691f00dfcc3a3e5211ca4ba47b12dadcf25f (patch)
tree e07713228ff9039bc3694d0cdf17af0d153f8277 /node-repository/src/main/java/com/yahoo/vespa/hosted/provision
parent dd38333faaeaeeccaef5e08fbf40be6feb2a67b4 (diff)
Keep a chronological log of events per node
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java 11
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java 3
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java 10
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java 61
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java 21
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java 8
7 files changed, 83 insertions, 33 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 3db68a27234..191c6d947ac 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -353,13 +353,16 @@ public final class Node implements Nodelike {
}
/** Returns a copy of this with any history record saying it has been detected down removed */
- public Node up() {
- return with(history.without(History.Event.Type.down));
+ public Node upAt(Instant instant, Agent agent) {
+ return with(history.with(new History.Event(History.Event.Type.up, agent, instant)));
}
- /** Returns whether this node has a record of being down */
+ /** Returns whether this node is down, according to its recorded 'down' and 'up' events */
public boolean isDown() {
- return history().event(History.Event.Type.down).isPresent();
+ Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at);
+ if (downAt.isEmpty()) return false;
+
+ return !history().hasEventAfter(History.Event.Type.up, downAt.get());
}
/** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 1fe29c8b162..62557b275c8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -20,7 +20,6 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.ClusterId;
-import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.persistence.CacheStats;
import com.yahoo.vespa.service.monitor.ServiceModel;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
@@ -248,7 +247,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
boolean down = NodeHealthTracker.allDown(services);
metric.set("nodeFailerBadNode", (down ? 1 : 0), context);
- boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent();
+ boolean nodeDownInNodeRepo = node.isDown();
metric.set("downInNodeRepo", (nodeDownInNodeRepo ? 1 : 0), context);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 3900d10a53e..237cbaedf46 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -155,7 +155,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
for (Node node : activeNodes) {
Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
- if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
+ if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
// Allow a grace period after node re-activation
if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index 874ff91d8a4..b43e2ae051f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -96,7 +96,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
if (isDown) {
recordAsDown(node.get(), lock);
} else {
- clearDownRecord(node.get(), lock);
+ recordAsUp(node.get(), lock);
}
} catch (ApplicationLockException e) {
// Fine, carry on with other nodes. We'll try updating this one in the next run
@@ -129,14 +129,14 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
/** Record a node as down if not already recorded */
private void recordAsDown(Node node, Mutex lock) {
- if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp
+ if (node.isDown()) return; // already down: Don't change down timestamp
nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock);
}
/** Clear down record for node, if any */
- private void clearDownRecord(Node node, Mutex lock) {
- if (node.history().event(History.Event.Type.down).isEmpty()) return;
- nodeRepository().nodes().write(node.up(), lock);
+ private void recordAsUp(Node node, Mutex lock) {
+ if (!node.isDown()) return; // already up: Don't record another up event
+ nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock);
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index f1e62634235..ac804f99cd3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -5,13 +5,17 @@ import com.google.common.collect.ImmutableMap;
import com.yahoo.vespa.hosted.provision.Node;
import java.time.Instant;
+import java.util.ArrayList;
import java.util.Collection;
-import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
/**
- * An immutable record of the last event of each type happening to this node.
+ * An immutable record of the last event of each type happening to this node, and a chronological log of the events.
+ *
* Note that the history cannot be used to find the nodes current state - it will have a record of some
* event happening in the past even if that event is later undone.
*
@@ -19,14 +23,24 @@ import java.util.stream.Collectors;
*/
public class History {
+ private static final int MAX_LOG_SIZE = 10;
+
private final ImmutableMap<Event.Type, Event> events;
+ private final List<Event> log;
+ private final int maxLogSize;
- public History(Collection<Event> events) {
- this(toImmutableMap(events));
+ public History(Collection<Event> events, List<Event> log) {
+ this(toImmutableMap(events), log, MAX_LOG_SIZE);
}
- private History(ImmutableMap<Event.Type, Event> events) {
+ History(ImmutableMap<Event.Type, Event> events, List<Event> log, int maxLogSize) {
this.events = events;
+ this.log = Objects.requireNonNull(log, "log must be non-null")
+ .stream()
+ .sorted(Comparator.comparing(Event::at))
+ .skip(Math.max(log.size() - maxLogSize, 0))
+ .collect(Collectors.toUnmodifiableList());
+ this.maxLogSize = maxLogSize;
}
private static ImmutableMap<Event.Type, Event> toImmutableMap(Collection<Event> events) {
@@ -36,7 +50,7 @@ public class History {
return builder.build();
}
- /** Returns this event if it is present in this history */
+ /** Returns the last event of given type, if it is present in this history */
public Optional<Event> event(Event.Type type) { return Optional.ofNullable(events.get(type)); }
/** Returns true if a given event is registered in this history at the given time */
@@ -60,18 +74,28 @@ public class History {
.orElse(false);
}
+ /** Returns the last event of each type in this history */
public Collection<Event> events() { return events.values(); }
+ /**
+ * Returns the events in this history, in chronological order. Compared to {@link #events()}, this holds all events
+ * as they occurred, up to log size limit
+ */
+ public List<Event> log() { return log; }
+
/** Returns a copy of this history with the given event added */
public History with(Event event) {
ImmutableMap.Builder<Event.Type, Event> builder = builderWithout(event.type());
builder.put(event.type(), event);
- return new History(builder.build());
+ List<Event> logCopy = new ArrayList<>(log);
+ logCopy.add(event);
+ return new History(builder.build(), logCopy, maxLogSize);
}
- /** Returns a copy of this history with the given event type removed (or an identical history if it was not present) */
+ /** Returns a copy of this history with the given event type removed (or an identical history if it was not
+ * present) and the log unchanged. */
public History without(Event.Type type) {
- return new History(builderWithout(type).build());
+ return new History(builderWithout(type).build(), log, maxLogSize);
}
private ImmutableMap.Builder<Event.Type, Event> builderWithout(Event.Type type) {
@@ -103,14 +127,14 @@ public class History {
/**
* Events can be application or node level.
- * This returns a copy of this history with all application level events removed.
+ * This returns a copy of this history with all application level events removed and the log unchanged.
*/
private History withoutApplicationEvents() {
- return new History(events().stream().filter(e -> ! e.type().isApplicationLevel()).collect(Collectors.toList()));
+ return new History(events().stream().filter(e -> ! e.type().isApplicationLevel()).collect(Collectors.toList()), log);
}
/** Returns the empty history */
- public static History empty() { return new History(Collections.emptyList()); }
+ public static History empty() { return new History(List.of(), List.of()); }
@Override
public String toString() {
@@ -198,6 +222,19 @@ public class History {
@Override
public String toString() { return "'" + type + "' event at " + at + " by " + agent; }
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ Event event = (Event) o;
+ return at.equals(event.at) && agent == event.agent && type == event.type;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(at, agent, type);
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 4990c1e9db8..7eea56c7c4e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -42,6 +42,7 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.Set;
@@ -75,6 +76,7 @@ public class NodeSerializer {
private static final String idKey = "openStackId";
private static final String parentHostnameKey = "parentHostname";
private static final String historyKey = "history";
+ private static final String logKey = "log";
private static final String instanceKey = "instance"; // legacy name, TODO: change to allocation with backwards compat
private static final String rebootGenerationKey = "rebootGeneration";
private static final String currentRebootGenerationKey = "currentRebootGeneration";
@@ -177,7 +179,8 @@ public class NodeSerializer {
object.setBool(wantToFailKey, node.status().wantToFail());
object.setBool(wantToRebuildKey, node.status().wantToRebuild());
node.allocation().ifPresent(allocation -> toSlime(allocation, object.setObject(instanceKey)));
- toSlime(node.history(), object.setArray(historyKey));
+ toSlime(node.history().events(), object.setArray(historyKey));
+ toSlime(node.history().log(), object.setArray(logKey));
object.setString(nodeTypeKey, toString(node.type()));
node.status().osVersion().current().ifPresent(version -> object.setString(osVersionKey, version.toString()));
node.status().osVersion().wanted().ifPresent(version -> object.setString(wantedOsVersionKey, version.toFullString()));
@@ -218,8 +221,8 @@ public class NodeSerializer {
allocation.networkPorts().ifPresent(ports -> NetworkPortsSerializer.toSlime(ports, object.setArray(networkPortsKey)));
}
- private void toSlime(History history, Cursor array) {
- for (History.Event event : history.events())
+ private void toSlime(Collection<History.Event> events, Cursor array) {
+ for (History.Event event : events)
toSlime(event, array.addObject());
}
@@ -277,7 +280,7 @@ public class NodeSerializer {
statusFromSlime(object),
state,
allocationFromSlime(flavor.resources(), object.field(instanceKey)),
- historyFromSlime(object.field(historyKey)),
+ historyFromSlime(object),
nodeTypeFromString(object.field(nodeTypeKey).asString()),
Reports.fromSlime(object.field(reportsKey)),
modelNameFromSlime(object),
@@ -338,14 +341,20 @@ public class NodeSerializer {
InstanceName.from(object.field(instanceIdKey).asString()));
}
- private History historyFromSlime(Inspector array) {
+ private History historyFromSlime(Inspector object) {
+ return new History(eventsFromSlime(object.field(historyKey)),
+ eventsFromSlime(object.field(logKey)));
+ }
+
+ private List<History.Event> eventsFromSlime(Inspector array) {
+ if (!array.valid()) return List.of();
List<History.Event> events = new ArrayList<>();
array.traverse((ArrayTraverser) (int i, Inspector item) -> {
History.Event event = eventFromSlime(item);
if (event != null)
events.add(event);
});
- return new History(events);
+ return events;
}
private History.Event eventFromSlime(Inspector object) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
index 922c8bc8e20..aa429522147 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
@@ -22,6 +22,7 @@ import com.yahoo.vespa.orchestrator.status.HostInfo;
import com.yahoo.vespa.orchestrator.status.HostStatus;
import java.net.URI;
+import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
@@ -171,7 +172,8 @@ class NodesResponse extends SlimeJsonResponse {
object.setBool("preferToRetire", node.status().preferToRetire());
object.setBool("wantToDeprovision", node.status().wantToDeprovision());
object.setBool("wantToRebuild", node.status().wantToRebuild());
- toSlime(node.history(), object.setArray("history"));
+ toSlime(node.history().events(), object.setArray("history"));
+ toSlime(node.history().log(), object.setArray("log"));
ipAddressesToSlime(node.ipConfig().primary(), object.setArray("ipAddresses"));
ipAddressesToSlime(node.ipConfig().pool().ipSet(), object.setArray("additionalIpAddresses"));
addressesToSlime(node.ipConfig().pool().getAddressList(), object);
@@ -196,8 +198,8 @@ class NodesResponse extends SlimeJsonResponse {
object.setBool("retired", membership.retired());
}
- private void toSlime(History history, Cursor array) {
- for (History.Event event : history.events()) {
+ private void toSlime(Collection<History.Event> events, Cursor array) {
+ for (History.Event event : events) {
Cursor object = array.addObject();
object.setString("event", event.type().name());
object.setLong("at", event.at().toEpochMilli());