author    Jon Bratseth <bratseth@gmail.com>  2023-03-01 14:26:52 +0100
committer Jon Bratseth <bratseth@gmail.com>  2023-03-01 14:26:52 +0100
commit    871c58d69fdec8705675fd55993e562ff04a044a
tree      5bafccaa2d0c75300cc339a8086791ded6098bcc
parent    31db1c671fac8e34f098f9d6d9797c2c1e05d433
Record UP events also when the node has not been down
Diffstat (limited to 'node-repository')
 node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java                             | 10
 node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java |  2
 node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java    | 13
 node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java       | 13
 4 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 79382a8bf5b..ba99219638b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -388,7 +388,7 @@ public final class Node implements Nodelike {
return with(history.with(new History.Event(History.Event.Type.up, agent, instant)));
}
- /** Returns whether this node is down, according to its recorded 'down' and 'up' events */
+ /** Returns true if we have positive evidence that this node is down. */
public boolean isDown() {
Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at);
if (downAt.isEmpty()) return false;
@@ -396,6 +396,14 @@ public final class Node implements Nodelike {
return !history().hasEventAfter(History.Event.Type.up, downAt.get());
}
+ /** Returns true if we have positive evidence that this node is up. */
+ public boolean isUp() {
+ Optional<Instant> upAt = history().event(History.Event.Type.up).map(History.Event::at);
+ if (upAt.isEmpty()) return false;
+
+ return !history().hasEventAfter(History.Event.Type.down, upAt.get());
+ }
+
/** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */
public Node allocate(ApplicationId owner, ClusterMembership membership, NodeResources requestedResources, Instant at) {
return this.with(new Allocation(owner, membership, requestedResources, new Generation(0, 0), false))
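The new isUp() deliberately mirrors isDown(): each returns true only on positive evidence, so a node with no recorded events is neither "up" nor "down". Below is a minimal sketch of that three-valued semantics; HealthRecord is a hypothetical stand-in written for illustration, not the real Node/History classes.

    import java.time.Instant;
    import java.util.Optional;

    // Hypothetical, simplified stand-in for Node's history bookkeeping.
    class HealthRecord {
        private Optional<Instant> downAt = Optional.empty();
        private Optional<Instant> upAt = Optional.empty();

        void recordDown(Instant at) { downAt = Optional.of(at); }
        void recordUp(Instant at)   { upAt = Optional.of(at); }

        // True only if a 'down' event exists with no 'up' event after it.
        boolean isDown() {
            return downAt.isPresent()
                    && upAt.map(up -> !up.isAfter(downAt.get())).orElse(true);
        }

        // True only if an 'up' event exists with no 'down' event after it.
        boolean isUp() {
            return upAt.isPresent()
                    && downAt.map(down -> !down.isAfter(upAt.get())).orElse(true);
        }
    }

With no events recorded, both methods return false. That "no evidence" state is what the NodeHealthTracker change below exploits: it lets an UP event be written for a node that has never been down.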
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
index 10f3b54dd45..e94ddd8e543 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
@@ -53,7 +53,7 @@ public class ClusterNodesTimeseries {
public String description() {
return "initial measurements: " + initialCount +
- ", after warmpup: " + afterWarmupCount +
+ ", after warmup: " + afterWarmupCount +
", after stable: " + afterStableCount +
", final measurements: " + finalCount;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index 94683d463af..cc5cbeb554f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -42,14 +42,13 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
@Override
protected double maintain() {
- return updateActiveNodeDownState();
+ return updateNodeHealth();
}
/**
- * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it.
- * Otherwise we remove any "down" history record.
+ * Update UP and DOWN node records for each node as they change.
*/
- private double updateActiveNodeDownState() {
+ private double updateNodeHealth() {
var attempts = new MutableInteger(0);
var failures = new MutableInteger(0);
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
@@ -59,7 +58,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
// Already correct record, nothing to do
boolean isDown = allDown(serviceInstances);
- if (isDown == node.get().isDown()) return;
+ if (isDown == node.get().isDown() && isDown != node.get().isUp()) return;
// Lock and update status
ApplicationId owner = node.get().allocation().get().owner();
@@ -82,7 +81,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
}
/**
- * Returns true if the node is considered bad: All monitored services services are down.
+ * Returns true if the node is considered bad: All monitored services are down.
* If a node remains bad for a long time, the NodeFailer will try to fail the node.
*/
static boolean allDown(List<ServiceInstance> services) {
@@ -110,7 +109,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
/** Clear down record for node, if any */
private void recordAsUp(Node node, Mutex lock) {
- if (!node.isDown()) return; // already up: Don't change down timestamp
+ if (node.isUp()) return; // already up: Don't change up timestamp
nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock);
}
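The reworked early return in updateNodeHealth() only skips a node when its record is both correct and unambiguous: the recorded state must match the observed one, and isDown() and isUp() must disagree (if both are false there is no evidence yet, so an UP event should be recorded). A hedged truth-table sketch; recordAlreadyCorrect is a hypothetical helper written for illustration, not part of NodeHealthTracker:

    // Hypothetical helper illustrating the condition
    // `isDown == node.get().isDown() && isDown != node.get().isUp()`.
    final class HealthDecision {
        static boolean recordAlreadyCorrect(boolean observedDown,
                                            boolean recordedDown,
                                            boolean recordedUp) {
            return observedDown == recordedDown && observedDown != recordedUp;
        }

        public static void main(String[] args) {
            // Fresh node, no events yet: record is not correct, so an UP
            // event gets written even though the node was never down.
            System.out.println(recordAlreadyCorrect(false, false, false)); // false
            // Observed up, with an up record: nothing to do.
            System.out.println(recordAlreadyCorrect(false, false, true));  // true
            // Observed down, with a down record: nothing to do.
            System.out.println(recordAlreadyCorrect(true, true, false));   // true
        }
    }

This is also why recordAsUp() now checks isUp() instead of !isDown(): a node without a down record may still lack an up record, and under the new logic it gets one.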
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 54062e966f1..491485b78fc 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -199,6 +199,11 @@ public class NodeFailerTest {
@Test
public void node_failing() {
NodeFailTester tester = NodeFailTester.withTwoApplications(6);
+ String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+ String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname();
+ // No liveness evidence yet:
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
// For a day all nodes work so nothing happens
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
@@ -208,10 +213,10 @@ public class NodeFailerTest {
assertEquals(0, tester.deployer.redeployments);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
}
- String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
- String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname();
tester.serviceMonitor.setHostDown(downHost1);
tester.serviceMonitor.setHostDown(downHost2);
// nothing happens the first 45 minutes
@@ -221,12 +226,16 @@ public class NodeFailerTest {
assertEquals(0, tester.deployer.redeployments);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
}
tester.serviceMonitor.setHostUp(downHost1);
// downHost2 should now be failed and replaced, but not downHost1
tester.clock.advance(Duration.ofDays(1));
tester.runMaintainers();
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
assertEquals(1, tester.deployer.redeployments);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());