diff options
author | Martin Polden <mpolden@mpolden.no> | 2022-04-13 15:32:33 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2022-04-19 09:20:39 +0200 |
commit | 71bb691f00dfcc3a3e5211ca4ba47b12dadcf25f (patch) | |
tree | e07713228ff9039bc3694d0cdf17af0d153f8277 /node-repository/src/main/java/com/yahoo/vespa/hosted/provision | |
parent | dd38333faaeaeeccaef5e08fbf40be6feb2a67b4 (diff) |
Keep a chronological log of events per node
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision')
7 files changed, 83 insertions, 33 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 3db68a27234..191c6d947ac 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -353,13 +353,16 @@ public final class Node implements Nodelike { } /** Returns a copy of this with any history record saying it has been detected down removed */ - public Node up() { - return with(history.without(History.Event.Type.down)); + public Node upAt(Instant instant, Agent agent) { + return with(history.with(new History.Event(History.Event.Type.up, agent, instant))); } - /** Returns whether this node has a record of being down */ + /** Returns whether this node is down, according to its recorded 'down' and 'up' events */ public boolean isDown() { - return history().event(History.Event.Type.down).isPresent(); + Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at); + if (downAt.isEmpty()) return false; + + return !history().hasEventAfter(History.Event.Type.up, downAt.get()); } /** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 1fe29c8b162..62557b275c8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -20,7 +20,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.ClusterId; -import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.persistence.CacheStats; import com.yahoo.vespa.service.monitor.ServiceModel; import com.yahoo.vespa.service.monitor.ServiceMonitor; @@ -248,7 +247,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { boolean down = NodeHealthTracker.allDown(services); metric.set("nodeFailerBadNode", (down ? 1 : 0), context); - boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent(); + boolean nodeDownInNodeRepo = node.isDown(); metric.set("downInNodeRepo", (nodeDownInNodeRepo ? 1 : 0), context); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 3900d10a53e..237cbaedf46 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -155,7 +155,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { for (Node node : activeNodes) { Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); - if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { + if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { // Allow a grace period after node re-activation if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 874ff91d8a4..b43e2ae051f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -96,7 +96,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { if (isDown) { recordAsDown(node.get(), lock); } else { - clearDownRecord(node.get(), lock); + recordAsUp(node.get(), lock); } } catch (ApplicationLockException e) { // Fine, carry on with other nodes. We'll try updating this one in the next run @@ -129,14 +129,14 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { /** Record a node as down if not already recorded */ private void recordAsDown(Node node, Mutex lock) { - if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp + if (node.isDown()) return; // already down: Don't change down timestamp nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock); } /** Clear down record for node, if any */ - private void clearDownRecord(Node node, Mutex lock) { - if (node.history().event(History.Event.Type.down).isEmpty()) return; - nodeRepository().nodes().write(node.up(), lock); + private void recordAsUp(Node node, Mutex lock) { + if (!node.isDown()) return; // already up: Don't change down timestamp + nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index f1e62634235..ac804f99cd3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -5,13 +5,17 @@ import com.google.common.collect.ImmutableMap; import com.yahoo.vespa.hosted.provision.Node; import java.time.Instant; +import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; /** - * An immutable record of the last event of each type happening to this node. + * An immutable record of the last event of each type happening to this node, and a chronological log of the events. + * * Note that the history cannot be used to find the nodes current state - it will have a record of some * event happening in the past even if that event is later undone. * @@ -19,14 +23,24 @@ import java.util.stream.Collectors; */ public class History { + private static final int MAX_LOG_SIZE = 10; + private final ImmutableMap<Event.Type, Event> events; + private final List<Event> log; + private final int maxLogSize; - public History(Collection<Event> events) { - this(toImmutableMap(events)); + public History(Collection<Event> events, List<Event> log) { + this(toImmutableMap(events), log, MAX_LOG_SIZE); } - private History(ImmutableMap<Event.Type, Event> events) { + History(ImmutableMap<Event.Type, Event> events, List<Event> log, int maxLogSize) { this.events = events; + this.log = Objects.requireNonNull(log, "log must be non-null") + .stream() + .sorted(Comparator.comparing(Event::at)) + .skip(Math.max(log.size() - maxLogSize, 0)) + .collect(Collectors.toUnmodifiableList()); + this.maxLogSize = maxLogSize; } private static ImmutableMap<Event.Type, Event> toImmutableMap(Collection<Event> events) { @@ -36,7 +50,7 @@ public class History { return builder.build(); } - /** Returns this event if it is present in this history */ + /** Returns the last event of given type, if it is present in this history */ public Optional<Event> event(Event.Type type) { return Optional.ofNullable(events.get(type)); } /** Returns true if a given event is registered in this history at the given time */ @@ -60,18 +74,28 @@ public class History { .orElse(false); } + /** Returns the last event of each type in this history */ public Collection<Event> events() { return events.values(); } + /** + * Returns the events in this history, in chronological order. Compared to {@link #events()}, this holds all events + * as they occurred, up to log size limit + */ + public List<Event> log() { return log; } + /** Returns a copy of this history with the given event added */ public History with(Event event) { ImmutableMap.Builder<Event.Type, Event> builder = builderWithout(event.type()); builder.put(event.type(), event); - return new History(builder.build()); + List<Event> logCopy = new ArrayList<>(log); + logCopy.add(event); + return new History(builder.build(), logCopy, maxLogSize); } - /** Returns a copy of this history with the given event type removed (or an identical history if it was not present) */ + /** Returns a copy of this history with the given event type removed (or an identical history if it was not + * present) and the log unchanged. */ public History without(Event.Type type) { - return new History(builderWithout(type).build()); + return new History(builderWithout(type).build(), log, maxLogSize); } private ImmutableMap.Builder<Event.Type, Event> builderWithout(Event.Type type) { @@ -103,14 +127,14 @@ public class History { /** * Events can be application or node level. - * This returns a copy of this history with all application level events removed. + * This returns a copy of this history with all application level events removed and the log unchanged. */ private History withoutApplicationEvents() { - return new History(events().stream().filter(e -> ! e.type().isApplicationLevel()).collect(Collectors.toList())); + return new History(events().stream().filter(e -> ! e.type().isApplicationLevel()).collect(Collectors.toList()), log); } /** Returns the empty history */ - public static History empty() { return new History(Collections.emptyList()); } + public static History empty() { return new History(List.of(), List.of()); } @Override public String toString() { @@ -198,6 +222,19 @@ public class History { @Override public String toString() { return "'" + type + "' event at " + at + " by " + agent; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Event event = (Event) o; + return at.equals(event.at) && agent == event.agent && type == event.type; + } + + @Override + public int hashCode() { + return Objects.hash(at, agent, type); + } + } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 4990c1e9db8..7eea56c7c4e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -42,6 +42,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.time.Instant; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Optional; import java.util.Set; @@ -75,6 +76,7 @@ public class NodeSerializer { private static final String idKey = "openStackId"; private static final String parentHostnameKey = "parentHostname"; private static final String historyKey = "history"; + private static final String logKey = "log"; private static final String instanceKey = "instance"; // legacy name, TODO: change to allocation with backwards compat private static final String rebootGenerationKey = "rebootGeneration"; private static final String currentRebootGenerationKey = "currentRebootGeneration"; @@ -177,7 +179,8 @@ public class NodeSerializer { object.setBool(wantToFailKey, node.status().wantToFail()); object.setBool(wantToRebuildKey, node.status().wantToRebuild()); node.allocation().ifPresent(allocation -> toSlime(allocation, object.setObject(instanceKey))); - toSlime(node.history(), object.setArray(historyKey)); + toSlime(node.history().events(), object.setArray(historyKey)); + toSlime(node.history().log(), object.setArray(logKey)); object.setString(nodeTypeKey, toString(node.type())); node.status().osVersion().current().ifPresent(version -> object.setString(osVersionKey, version.toString())); node.status().osVersion().wanted().ifPresent(version -> object.setString(wantedOsVersionKey, version.toFullString())); @@ -218,8 +221,8 @@ public class NodeSerializer { allocation.networkPorts().ifPresent(ports -> NetworkPortsSerializer.toSlime(ports, object.setArray(networkPortsKey))); } - private void toSlime(History history, Cursor array) { - for (History.Event event : history.events()) + private void toSlime(Collection<History.Event> events, Cursor array) { + for (History.Event event : events) toSlime(event, array.addObject()); } @@ -277,7 +280,7 @@ public class NodeSerializer { statusFromSlime(object), state, allocationFromSlime(flavor.resources(), object.field(instanceKey)), - historyFromSlime(object.field(historyKey)), + historyFromSlime(object), nodeTypeFromString(object.field(nodeTypeKey).asString()), Reports.fromSlime(object.field(reportsKey)), modelNameFromSlime(object), @@ -338,14 +341,20 @@ public class NodeSerializer { InstanceName.from(object.field(instanceIdKey).asString())); } - private History historyFromSlime(Inspector array) { + private History historyFromSlime(Inspector object) { + return new History(eventsFromSlime(object.field(historyKey)), + eventsFromSlime(object.field(logKey))); + } + + private List<History.Event> eventsFromSlime(Inspector array) { + if (!array.valid()) return List.of(); List<History.Event> events = new ArrayList<>(); array.traverse((ArrayTraverser) (int i, Inspector item) -> { History.Event event = eventFromSlime(item); if (event != null) events.add(event); }); - return new History(events); + return events; } private History.Event eventFromSlime(Inspector object) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java index 922c8bc8e20..aa429522147 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java @@ -22,6 +22,7 @@ import com.yahoo.vespa.orchestrator.status.HostInfo; import com.yahoo.vespa.orchestrator.status.HostStatus; import java.net.URI; +import java.util.Collection; import java.util.Comparator; import java.util.List; import java.util.Optional; @@ -171,7 +172,8 @@ class NodesResponse extends SlimeJsonResponse { object.setBool("preferToRetire", node.status().preferToRetire()); object.setBool("wantToDeprovision", node.status().wantToDeprovision()); object.setBool("wantToRebuild", node.status().wantToRebuild()); - toSlime(node.history(), object.setArray("history")); + toSlime(node.history().events(), object.setArray("history")); + toSlime(node.history().log(), object.setArray("log")); ipAddressesToSlime(node.ipConfig().primary(), object.setArray("ipAddresses")); ipAddressesToSlime(node.ipConfig().pool().ipSet(), object.setArray("additionalIpAddresses")); addressesToSlime(node.ipConfig().pool().getAddressList(), object); @@ -196,8 +198,8 @@ class NodesResponse extends SlimeJsonResponse { object.setBool("retired", membership.retired()); } - private void toSlime(History history, Cursor array) { - for (History.Event event : history.events()) { + private void toSlime(Collection<History.Event> events, Cursor array) { + for (History.Event event : events) { Cursor object = array.addObject(); object.setString("event", event.type().name()); object.setLong("at", event.at().toEpochMilli()); |