diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2023-03-01 14:58:07 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-03-01 14:58:07 +0100 |
commit | d164f53c5043637bce5b3a79df36bb53ecabb051 (patch) | |
tree | 67b20b4728b59d18caf3f27d255a129b17f4b58b /node-repository | |
parent | 0c766a6506aa82299c2cd1d166fcc3522a962f2f (diff) | |
parent | 3be30d7343e6d2c56f1d167cc5d47275acff6e00 (diff) |
Merge pull request #26250 from vespa-engine/bratseth/up
Record UP events also when not having been down
Diffstat (limited to 'node-repository')
4 files changed, 27 insertions, 11 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 79382a8bf5b..ba99219638b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -388,7 +388,7 @@ public final class Node implements Nodelike { return with(history.with(new History.Event(History.Event.Type.up, agent, instant))); } - /** Returns whether this node is down, according to its recorded 'down' and 'up' events */ + /** Returns true if we have positive evidence that this node is down. */ public boolean isDown() { Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at); if (downAt.isEmpty()) return false; @@ -396,6 +396,14 @@ public final class Node implements Nodelike { return !history().hasEventAfter(History.Event.Type.up, downAt.get()); } + /** Returns true if we have positive evidence that this node is up. */ + public boolean isUp() { + Optional<Instant> upAt = history().event(History.Event.Type.up).map(History.Event::at); + if (upAt.isEmpty()) return false; + + return !history().hasEventAfter(History.Event.Type.down, upAt.get()); + } + /** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */ public Node allocate(ApplicationId owner, ClusterMembership membership, NodeResources requestedResources, Instant at) { return this.with(new Allocation(owner, membership, requestedResources, new Generation(0, 0), false)) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index 10f3b54dd45..e94ddd8e543 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -53,7 +53,7 @@ public class ClusterNodesTimeseries { public String description() { return "initial measurements: " + initialCount + - ", after warmpup: " + afterWarmupCount + + ", after warmup: " + afterWarmupCount + ", after stable: " + afterStableCount + ", final measurements: " + finalCount; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 94683d463af..781debe26a0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -42,14 +42,13 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { @Override protected double maintain() { - return updateActiveNodeDownState(); + return updateNodeHealth(); } /** - * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it. - * Otherwise we remove any "down" history record. + * Update UP and DOWN node records for each node as they change. */ - private double updateActiveNodeDownState() { + private double updateNodeHealth() { var attempts = new MutableInteger(0); var failures = new MutableInteger(0); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); @@ -59,7 +58,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { // Already correct record, nothing to do boolean isDown = allDown(serviceInstances); - if (isDown == node.get().isDown()) return; + if (isDown == node.get().isDown() && isDown != node.get().isUp()) return; // Lock and update status ApplicationId owner = node.get().allocation().get().owner(); @@ -82,7 +81,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } /** - * Returns true if the node is considered bad: All monitored services services are down. + * Returns true if the node is considered bad: All monitored services are down. * If a node remains bad for a long time, the NodeFailer will try to fail the node. */ static boolean allDown(List<ServiceInstance> services) { @@ -110,7 +109,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { /** Clear down record for node, if any */ private void recordAsUp(Node node, Mutex lock) { - if (!node.isDown()) return; // already up: Don't change down timestamp + if (node.isUp()) return; // already up: Don't change up timestamp nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 54062e966f1..491485b78fc 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -199,6 +199,11 @@ public class NodeFailerTest { @Test public void node_failing() { NodeFailTester tester = NodeFailTester.withTwoApplications(6); + String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname(); + String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname(); + // No liveness evidence yet: + assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown()); + assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp()); // For a day all nodes work so nothing happens for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) { @@ -208,10 +213,10 @@ public class NodeFailerTest { assertEquals(0, tester.deployer.redeployments); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown()); + assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp()); } - String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname(); - String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname(); tester.serviceMonitor.setHostDown(downHost1); tester.serviceMonitor.setHostDown(downHost2); // nothing happens the first 45 minutes @@ -221,12 +226,16 @@ public class NodeFailerTest { assertEquals(0, tester.deployer.redeployments); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); + assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isDown()); + assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp()); } tester.serviceMonitor.setHostUp(downHost1); // downHost2 should now be failed and replaced, but not downHost1 tester.clock.advance(Duration.ofDays(1)); tester.runMaintainers(); + assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown()); + assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp()); assertEquals(1, tester.deployer.redeployments); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); |