diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa')
9 files changed, 43 insertions, 113 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 48c85aed640..567a5c03f43 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -431,14 +431,20 @@ public final class Node implements Nodelike { return with(history.with(new History.Event(History.Event.Type.up, agent, instant))); } - /** Returns a copy of this with a history event saying it has been suspended at instant. */ - public Node suspendedAt(Instant instant, Agent agent) { - return with(history.with(new History.Event(History.Event.Type.suspended, agent, instant))); + /** Returns true if we have positive evidence that this node is down. */ + public boolean isDown() { + Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at); + if (downAt.isEmpty()) return false; + + return !history().hasEventAfter(History.Event.Type.up, downAt.get()); } - /** Returns a copy of this with a history event saying it has been resumed at instant. */ - public Node resumedAt(Instant instant, Agent agent) { - return with(history.with(new History.Event(History.Event.Type.resumed, agent, instant))); + /** Returns true if we have positive evidence that this node is up. */ + public boolean isUp() { + Optional<Instant> upAt = history().event(History.Event.Type.up).map(History.Event::at); + if (upAt.isEmpty()) return false; + + return !history().hasEventAfter(History.Event.Type.down, upAt.get()); } /** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index bed3d9fcb04..957996f05e4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -221,7 +221,7 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { } /** Returns the subset of nodes which have a record of being down */ - public NodeList down() { return matching(node -> node.history().isDown()); } + public NodeList down() { return matching(Node::isDown); } /** Returns the subset of nodes which are being retired */ public NodeList retiring() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index b644e5d8a08..c203f8a8321 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -268,7 +268,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { boolean down = NodeHealthTracker.allDown(services); metric.set(ConfigServerMetrics.NODE_FAILER_BAD_NODE.baseName(), (down ? 1 : 0), context); - boolean nodeDownInNodeRepo = node.history().isDown();; + boolean nodeDownInNodeRepo = node.isDown(); metric.set(ConfigServerMetrics.DOWN_IN_NODE_REPO.baseName(), (nodeDownInNodeRepo ? 1 : 0), context); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 27301c9bf2a..6c4be09c489 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -25,7 +25,6 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Objects; @@ -34,7 +33,6 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Maintains information in the node repo about when this node last responded to ping @@ -112,7 +110,12 @@ public class NodeFailer extends NodeRepositoryMaintainer { failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes")); for (Node node : activeNodes) { - downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant))); + Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); + if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !affectedByMaintenance(node)) { + // Allow a grace period after node re-activation + if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) + failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); + } } for (Node node : activeNodes) { @@ -146,26 +149,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { return !reasonsToFailHost(host).isEmpty(); } - private Optional<Instant> downSince(Node node) { - Optional<Instant> downInstant = node.history().downSince(); - if (downInstant.isEmpty()) return Optional.empty(); - - Instant downSince = Stream.of(downInstant, - node.history().resumedSince(), - node.history().event(History.Event.Type.activated).map(History.Event::at)) - .filter(Optional::isPresent) - .map(Optional::get) - .max(Comparator.naturalOrder()) - .orElseThrow(); - Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit; - if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty(); - - if (applicationSuspended(node)) return Optional.empty(); - if (affectedByMaintenance(node)) return Optional.empty(); - - return Optional.of(downSince); - } - private boolean applicationSuspended(Node node) { try { return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner()) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 4dc9c62ce0f..e0421b78ff2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -12,10 +12,6 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.orchestrator.Orchestrator; -import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; -import com.yahoo.vespa.orchestrator.status.HostInfo; -import com.yahoo.vespa.orchestrator.status.HostStatus; import com.yahoo.vespa.service.monitor.ServiceMonitor; import com.yahoo.yolean.Exceptions; @@ -24,9 +20,9 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.logging.Level; +import java.util.stream.Collectors; import static java.util.stream.Collectors.counting; -import static java.util.stream.Collectors.groupingBy; /** * Checks if nodes are responding and updates their status accordingly @@ -37,13 +33,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { /** Provides (more accurate) information about the status of active hosts */ private final ServiceMonitor serviceMonitor; - private final Orchestrator orchestrator; public NodeHealthTracker(ServiceMonitor serviceMonitor, NodeRepository nodeRepository, Duration interval, Metric metric) { super(nodeRepository, interval, metric); this.serviceMonitor = serviceMonitor; - this.orchestrator = nodeRepository().orchestrator(); } @Override @@ -61,14 +55,10 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional<Node> node = activeNodes.node(hostname.toString()); if (node.isEmpty()) return; - boolean isDown = allDown(serviceInstances); - - Optional<HostStatus> status = orchestrator.getOptionalNodeStatus(node.get().hostname()); - if (status.isEmpty()) return; - boolean isSuspended = status.get().isSuspended(); // Already correct record, nothing to do - if (isDownConsistent(node.get(), isDown) && isSuspendedConsistent(node.get(), isSuspended)) return; + boolean isDown = allDown(serviceInstances); + if (isDown == node.get().isDown() && isDown != node.get().isUp()) return; // Lock and update status ApplicationId owner = node.get().allocation().get().owner(); @@ -76,14 +66,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation attempts.add(1); - - Node newNode = node.get(); - if (!isDownConsistent(newNode, isDown)) - newNode = isDown ? newNode.downAt(clock().instant(), Agent.NodeHealthTracker) : newNode.upAt(clock().instant(), Agent.NodeHealthTracker); - if (!isSuspendedConsistent(newNode, isSuspended)) - newNode = isSuspended ? newNode.suspendedAt(clock().instant(), Agent.NodeHealthTracker) : newNode.resumedAt(clock().instant(), Agent.NodeHealthTracker); - if (newNode != node.get()) - nodeRepository().nodes().write(newNode, lock); + if (isDown) { + recordAsDown(node.get(), lock); + } else { + recordAsUp(node.get(), lock); + } } catch (ApplicationLockException e) { // Fine, carry on with other nodes. We'll try updating this one in the next run log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e)); @@ -98,7 +85,8 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { * If a node remains bad for a long time, the NodeFailer will try to fail the node. */ static boolean allDown(List<ServiceInstance> services) { - Map<ServiceStatus, Long> countsByStatus = services.stream().collect(groupingBy(ServiceInstance::serviceStatus, counting())); + Map<ServiceStatus, Long> countsByStatus = services.stream() + .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting())); return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L && countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L && @@ -113,11 +101,16 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { .filter(node -> node.allocation().get().owner().equals(application)); } - private static boolean isDownConsistent(Node node, boolean isDown) { - return isDown ? node.history().isDown() : node.history().isUp(); + /** Record a node as down if not already recorded */ + private void recordAsDown(Node node, Mutex lock) { + if (node.isDown()) return; // already down: Don't change down timestamp + nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock); } - private static boolean isSuspendedConsistent(Node node, boolean isSuspended) { - return isSuspended ? node.history().isSuspended() : node.history().isResumed(); + /** Clear down record for node, if any */ + private void recordAsUp(Node node, Mutex lock) { + if (node.isUp()) return; // already up: Don't change up timestamp + nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock); } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index 538b5ccd617..491b388416b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -12,6 +12,7 @@ import java.util.Comparator; import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; /** * An immutable record of the last event of each type happening to this node, and a chronological log of the events. @@ -80,34 +81,6 @@ public class History { .orElse(false); } - /** Returns the instant the services went down, unless the services have been up afterward. */ - public Optional<Instant> downSince() { return instantOf(Event.Type.down, Event.Type.up); } - - /** Returns the instant the services went up, unless the services have been down afterward. */ - public Optional<Instant> upSince() { return instantOf(Event.Type.up, Event.Type.down); } - - /** Returns true if there is a down event without a later up. */ - public boolean isDown() { return downSince().isPresent(); } - - /** Returns true if there is an up event without a later down. */ - public boolean isUp() { return upSince().isPresent(); } - - /** Returns the instant the node suspended, unless the node has been resumed afterward. */ - public Optional<Instant> suspendedSince() { return instantOf(Event.Type.suspended, Event.Type.resumed); } - - /** Returns the instant the node was resumed, unless the node has been suspended afterward. */ - public Optional<Instant> resumedSince() { return instantOf(Event.Type.resumed, Event.Type.suspended); } - - /** Returns true if there is a suspended event without a later resumed. */ - public boolean isSuspended() { return suspendedSince().isPresent(); } - - /** Returns true if there is a resumed event without a later suspended. */ - public boolean isResumed() { return resumedSince().isPresent(); } - - private Optional<Instant> instantOf(History.Event.Type type, History.Event.Type sentinelType) { - return event(type).map(History.Event::at).filter(instant -> !hasEventAfter(sentinelType, instant)); - } - /** Returns the last event of each type in this history */ public Collection<Event> events() { return events.values(); } @@ -207,10 +180,6 @@ public class History { down, // The active node came up according to the service monitor up, - // The node has been given permission to suspend by Orchestrator - suspended, - // The node has resumed from suspension by Orchestrator - resumed, // The node resources/flavor were changed resized(false), // The node was rebooted diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 9fffdd574dc..5efe5d8b2a8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -428,8 +428,6 @@ public class NodeSerializer { case "deallocated" -> History.Event.Type.deallocated; case "down" -> History.Event.Type.down; case "up" -> History.Event.Type.up; - case "suspended" -> History.Event.Type.suspended; - case "resumed" -> History.Event.Type.resumed; case "resized" -> History.Event.Type.resized; case "rebooted" -> History.Event.Type.rebooted; case "osUpgraded" -> History.Event.Type.osUpgraded; @@ -456,8 +454,6 @@ public class NodeSerializer { case deallocated -> "deallocated"; case down -> "down"; case up -> "up"; - case suspended -> "suspended"; - case resumed -> "resumed"; case resized -> "resized"; case rebooted -> "rebooted"; case osUpgraded -> "osUpgraded"; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java index c518087f325..b8c841771f5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java @@ -180,7 +180,7 @@ class NodesResponse extends SlimeJsonResponse { object.setBool("wantToDeprovision", node.status().wantToDeprovision()); object.setBool("wantToRebuild", node.status().wantToRebuild()); object.setBool("wantToUpgradeFlavor", node.status().wantToUpgradeFlavor()); - object.setBool("down", node.history().isDown()); + object.setBool("down", node.isDown()); toSlime(node.history().events(), object.setArray("history")); toSlime(node.history().log(), object.setArray("log")); ipAddressesToSlime(node.ipConfig().primary(), object.setArray("ipAddresses")); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java index f0b006bcc8e..b818b54070c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java @@ -1,10 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.testutils; -import com.google.inject.Inject; import com.yahoo.component.AbstractComponent; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.jdisc.test.TestTimer; import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.orchestrator.Host; @@ -13,7 +11,6 @@ import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; import com.yahoo.vespa.orchestrator.status.HostInfo; import com.yahoo.vespa.orchestrator.status.HostStatus; -import java.time.Clock; import java.time.Instant; import java.util.Collections; import java.util.HashMap; @@ -31,14 +28,6 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator private final Map<HostName, HostInfo> suspendedHosts = new HashMap<>(); private final Set<ApplicationId> suspendedApplications = new HashSet<>(); - private final Clock clock; - - @Inject - public OrchestratorMock() { this(new TestTimer(Instant.EPOCH).toUtcClock()); } - - public OrchestratorMock(Clock clock) { - this.clock = clock; - } @Override public Host getHost(HostName hostName) { @@ -63,13 +52,7 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator } @Override - public void setNodeStatus(HostName hostName, HostStatus state) { - if (state == HostStatus.NO_REMARKS) { - suspendedHosts.remove(hostName); - } else { - suspendedHosts.put(hostName, HostInfo.createSuspended(state, clock.instant())); - } - } + public void setNodeStatus(HostName hostName, HostStatus state) {} @Override public void resume(HostName hostName) { @@ -78,7 +61,7 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator @Override public void suspend(HostName hostName) { - suspendedHosts.put(hostName, HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, clock.instant())); + suspendedHosts.put(hostName, HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.EPOCH)); } @Override |