aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorOla Aunrønning <olaa@yahooinc.com>2023-04-14 16:56:41 +0200
committerGitHub <noreply@github.com>2023-04-14 16:56:41 +0200
commitdb5302e19949e6a3b5989a47631a44d8392ea017 (patch)
treef9d3ded90d7f086a2785e92dbfdb3a47422f06a5 /node-repository
parentd025a93015e66efc0027d81a64e70530d6cb240e (diff)
Don't fail nodes undergoing CMR (#26743)
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java16
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java44
2 files changed, 59 insertions, 1 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index f9ff2f08375..766bc688c62 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -7,6 +7,7 @@ import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
+import com.yahoo.slime.SlimeUtils;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
@@ -109,7 +110,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
for (Node node : activeNodes) {
Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
- if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
+ if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !undergoingCmr(node)) {
// Allow a grace period after node re-activation
if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
@@ -157,6 +158,19 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
}
+ private boolean undergoingCmr(Node node) {
+ return node.reports().getReport("vcmr")
+ .map(report ->
+ SlimeUtils.entriesStream(report.getInspector().field("upcoming"))
+ .anyMatch(cmr -> {
+ var startTime = cmr.field("plannedStartTime").asLong();
+ var endTime = cmr.field("plannedEndTime").asLong();
+ var now = clock().instant().getEpochSecond();
+ return now > startTime && now < endTime;
+ })
+ ).orElse(false);
+ }
+
/** Is the node and all active children suspended? */
private boolean allSuspended(Node node, NodeList activeNodes) {
if (!nodeRepository().nodes().suspended(node)) return false;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 491485b78fc..c63be6d5dc5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.slime.SlimeUtils;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
@@ -627,6 +628,49 @@ public class NodeFailerTest {
assertFalse(badNode(1, 3, 1, 2));
}
+ @Test
+ public void nodes_undergoing_cmr_are_not_failed() {
+ var tester = NodeFailTester.withTwoApplications(6);
+ var clock = tester.clock;
+ var slime = SlimeUtils.jsonToSlime(
+ String.format("""
+ {
+ "upcoming":[{
+ "id": "id-42",
+ "status": "some-status",
+ "plannedStartTime": %d,
+ "plannedEndTime": %d
+ }]
+ }
+ """, clock.instant().getEpochSecond(), clock.instant().plus(Duration.ofMinutes(90)).getEpochSecond())
+ );
+ var cmrReport = Report.fromSlime("vcmr", slime.get());
+ var downHost = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+
+ var node = tester.nodeRepository.nodes().node(downHost).get();
+ tester.nodeRepository.nodes().write(node.with(node.reports().withReport(cmrReport)), () -> {});
+
+ tester.serviceMonitor.setHostDown(downHost);
+ tester.runMaintainers();
+ node = tester.nodeRepository.nodes().node(downHost).get();
+ assertTrue(node.isDown());
+ assertEquals(Node.State.active, node.state());
+
+ // CMR still ongoing, don't fail yet
+ clock.advance(Duration.ofHours(1));
+ tester.runMaintainers();
+ node = tester.nodeRepository.nodes().node(downHost).get();
+ assertTrue(node.isDown());
+ assertEquals(Node.State.active, node.state());
+
+ // No ongoing CMR anymore, host should be failed
+ clock.advance(Duration.ofHours(1));
+ tester.runMaintainers();
+ node = tester.nodeRepository.nodes().node(downHost).get();
+ assertTrue(node.isDown());
+ assertEquals(Node.State.failed, node.state());
+ }
+
private void addServiceInstances(List<ServiceInstance> list, ServiceStatus status, int num) {
for (int i = 0; i < num; ++i) {
ServiceInstance service = mock(ServiceInstance.class);