diff options
author | Harald Musum <musum@vespa.ai> | 2024-01-06 00:56:32 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-06 00:56:32 +0100 |
commit | 93203c07633be5c148f4c2b23746f4dac83561b2 (patch) | |
tree | 8f1239d3816625b4519324d8c9736f9d6ae64f6b /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java | |
parent | e4da75db4556a3cd72b034c4406027f9bba73918 (diff) |
Revert "Reset downtime at resume"
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java | 45 |
1 files changed, 19 insertions, 26 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 4dc9c62ce0f..e0421b78ff2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -12,10 +12,6 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.orchestrator.Orchestrator; -import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; -import com.yahoo.vespa.orchestrator.status.HostInfo; -import com.yahoo.vespa.orchestrator.status.HostStatus; import com.yahoo.vespa.service.monitor.ServiceMonitor; import com.yahoo.yolean.Exceptions; @@ -24,9 +20,9 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.logging.Level; +import java.util.stream.Collectors; import static java.util.stream.Collectors.counting; -import static java.util.stream.Collectors.groupingBy; /** * Checks if nodes are responding and updates their status accordingly @@ -37,13 +33,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { /** Provides (more accurate) information about the status of active hosts */ private final ServiceMonitor serviceMonitor; - private final Orchestrator orchestrator; public NodeHealthTracker(ServiceMonitor serviceMonitor, NodeRepository nodeRepository, Duration interval, Metric metric) { super(nodeRepository, interval, metric); this.serviceMonitor = serviceMonitor; - this.orchestrator = nodeRepository().orchestrator(); } @Override @@ -61,14 +55,10 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional<Node> node = activeNodes.node(hostname.toString()); if (node.isEmpty()) return; - boolean isDown = allDown(serviceInstances); - - Optional<HostStatus> status = orchestrator.getOptionalNodeStatus(node.get().hostname()); - if (status.isEmpty()) return; - boolean isSuspended = status.get().isSuspended(); // Already correct record, nothing to do - if (isDownConsistent(node.get(), isDown) && isSuspendedConsistent(node.get(), isSuspended)) return; + boolean isDown = allDown(serviceInstances); + if (isDown == node.get().isDown() && isDown != node.get().isUp()) return; // Lock and update status ApplicationId owner = node.get().allocation().get().owner(); @@ -76,14 +66,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation attempts.add(1); - - Node newNode = node.get(); - if (!isDownConsistent(newNode, isDown)) - newNode = isDown ? newNode.downAt(clock().instant(), Agent.NodeHealthTracker) : newNode.upAt(clock().instant(), Agent.NodeHealthTracker); - if (!isSuspendedConsistent(newNode, isSuspended)) - newNode = isSuspended ? newNode.suspendedAt(clock().instant(), Agent.NodeHealthTracker) : newNode.resumedAt(clock().instant(), Agent.NodeHealthTracker); - if (newNode != node.get()) - nodeRepository().nodes().write(newNode, lock); + if (isDown) { + recordAsDown(node.get(), lock); + } else { + recordAsUp(node.get(), lock); + } } catch (ApplicationLockException e) { // Fine, carry on with other nodes. We'll try updating this one in the next run log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e)); @@ -98,7 +85,8 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { * If a node remains bad for a long time, the NodeFailer will try to fail the node. */ static boolean allDown(List<ServiceInstance> services) { - Map<ServiceStatus, Long> countsByStatus = services.stream().collect(groupingBy(ServiceInstance::serviceStatus, counting())); + Map<ServiceStatus, Long> countsByStatus = services.stream() + .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting())); return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L && countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L && @@ -113,11 +101,16 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { .filter(node -> node.allocation().get().owner().equals(application)); } - private static boolean isDownConsistent(Node node, boolean isDown) { - return isDown ? node.history().isDown() : node.history().isUp(); + /** Record a node as down if not already recorded */ + private void recordAsDown(Node node, Mutex lock) { + if (node.isDown()) return; // already down: Don't change down timestamp + nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock); } - private static boolean isSuspendedConsistent(Node node, boolean isSuspended) { - return isSuspended ? node.history().isSuspended() : node.history().isResumed(); + /** Clear down record for node, if any */ + private void recordAsUp(Node node, Mutex lock) { + if (node.isUp()) return; // already up: Don't change up timestamp + nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock); } + } |