From db5302e19949e6a3b5989a47631a44d8392ea017 Mon Sep 17 00:00:00 2001 From: Ola Aunrønning Date: Fri, 14 Apr 2023 16:56:41 +0200 Subject: Don't fail nodes undergoing CMR (#26743) --- .../hosted/provision/maintenance/NodeFailer.java | 16 +++++++- .../provision/maintenance/NodeFailerTest.java | 44 ++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) (limited to 'node-repository/src') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index f9ff2f08375..766bc688c62 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -7,6 +7,7 @@ import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TransientException; import com.yahoo.jdisc.Metric; +import com.yahoo.slime.SlimeUtils; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -109,7 +110,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { for (Node node : activeNodes) { Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); - if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { + if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !undergoingCmr(node)) { // Allow a grace period after node re-activation if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); @@ -157,6 +158,19 @@ public class NodeFailer extends NodeRepositoryMaintainer { } } + private boolean undergoingCmr(Node node) { + return node.reports().getReport("vcmr") + .map(report -> + SlimeUtils.entriesStream(report.getInspector().field("upcoming")) + .anyMatch(cmr -> { + var startTime = cmr.field("plannedStartTime").asLong(); + var endTime = cmr.field("plannedEndTime").asLong(); + var now = clock().instant().getEpochSecond(); + return now > startTime && now < endTime; + }) + ).orElse(false); + } + /** Is the node and all active children suspended? */ private boolean allSuspended(Node node, NodeList activeNodes) { if (!nodeRepository().nodes().suspended(node)) return false; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 491485b78fc..c63be6d5dc5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import com.yahoo.slime.SlimeUtils; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; @@ -627,6 +628,49 @@ public class NodeFailerTest { assertFalse(badNode(1, 3, 1, 2)); } + @Test + public void nodes_undergoing_cmr_are_not_failed() { + var tester = NodeFailTester.withTwoApplications(6); + var clock = tester.clock; + var slime = SlimeUtils.jsonToSlime( + String.format(""" + { + "upcoming":[{ + "id": "id-42", + "status": "some-status", + "plannedStartTime": %d, + "plannedEndTime": %d + }] + } + """, clock.instant().getEpochSecond(), clock.instant().plus(Duration.ofMinutes(90)).getEpochSecond()) + ); + var cmrReport = Report.fromSlime("vcmr", slime.get()); + var downHost = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname(); + + var node = tester.nodeRepository.nodes().node(downHost).get(); + tester.nodeRepository.nodes().write(node.with(node.reports().withReport(cmrReport)), () -> {}); + + tester.serviceMonitor.setHostDown(downHost); + tester.runMaintainers(); + node = tester.nodeRepository.nodes().node(downHost).get(); + assertTrue(node.isDown()); + assertEquals(Node.State.active, node.state()); + + // CMR still ongoing, don't fail yet + clock.advance(Duration.ofHours(1)); + tester.runMaintainers(); + node = tester.nodeRepository.nodes().node(downHost).get(); + assertTrue(node.isDown()); + assertEquals(Node.State.active, node.state()); + + // No ongoing CMR anymore, host should be failed + clock.advance(Duration.ofHours(1)); + tester.runMaintainers(); + node = tester.nodeRepository.nodes().node(downHost).get(); + assertTrue(node.isDown()); + assertEquals(Node.State.failed, node.state()); + } + private void addServiceInstances(List list, ServiceStatus status, int num) { for (int i = 0; i < num; ++i) { ServiceInstance service = mock(ServiceInstance.class); -- cgit v1.2.3