From 83f531bf6da495a7803ebbf3350265ebdd2bab19 Mon Sep 17 00:00:00 2001
From: Håkon Hallingstad
Date: Wed, 10 Jan 2024 09:46:29 +0100
Subject: Reset downtime at resume, 2. try

---
 .../com/yahoo/vespa/hosted/provision/Node.java     | 18 ++---
 .../com/yahoo/vespa/hosted/provision/NodeList.java |  2 +-
 .../provision/maintenance/MetricsReporter.java     |  2 +-
 .../hosted/provision/maintenance/NodeFailer.java   | 29 +++++--
 .../provision/maintenance/NodeHealthTracker.java   | 45 ++++++-----
 .../yahoo/vespa/hosted/provision/node/History.java | 29 ++++++-
 .../provision/persistence/NodeSerializer.java      |  3 +-
 .../hosted/provision/restapi/NodesResponse.java    |  2 +-
 .../provision/testutils/OrchestratorMock.java      | 21 ++++-
 .../provision/maintenance/NodeFailerTest.java      | 90 ++++++++++++++++++----
 .../provision/maintenance/RetiredExpirerTest.java  |  2 +-
 .../provision/provisioning/ProvisioningTester.java | 36 +++++----
 12 files changed, 207 insertions(+), 72 deletions(-)

(limited to 'node-repository/src')

diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 567a5c03f43..48c85aed640 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -431,20 +431,14 @@ public final class Node implements Nodelike {
         return with(history.with(new History.Event(History.Event.Type.up, agent, instant)));
     }
 
-    /** Returns true if we have positive evidence that this node is down. */
-    public boolean isDown() {
-        Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at);
-        if (downAt.isEmpty()) return false;
-
-        return !history().hasEventAfter(History.Event.Type.up, downAt.get());
+    /** Returns a copy of this with a history event saying it has been suspended at instant. */
+    public Node suspendedAt(Instant instant, Agent agent) {
+        return with(history.with(new History.Event(History.Event.Type.suspended, agent, instant)));
     }
 
-    /** Returns true if we have positive evidence that this node is up. */
-    public boolean isUp() {
-        Optional<Instant> upAt = history().event(History.Event.Type.up).map(History.Event::at);
-        if (upAt.isEmpty()) return false;
-
-        return !history().hasEventAfter(History.Event.Type.down, upAt.get());
+    /** Returns a copy of this with a history event saying it has been resumed at instant. */
+    public Node resumedAt(Instant instant, Agent agent) {
+        return with(history.with(new History.Event(History.Event.Type.resumed, agent, instant)));
     }
 
     /** Returns a copy of this with allocation set as specified. node.state is *not* changed. */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 957996f05e4..bed3d9fcb04 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -221,7 +221,7 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
     }
 
     /** Returns the subset of nodes which have a record of being down */
-    public NodeList down() { return matching(Node::isDown); }
+    public NodeList down() { return matching(node -> node.history().isDown()); }
 
     /** Returns the subset of nodes which are being retired */
     public NodeList retiring() {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index c203f8a8321..b644e5d8a08 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -268,7 +268,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
         boolean down = NodeHealthTracker.allDown(services);
         metric.set(ConfigServerMetrics.NODE_FAILER_BAD_NODE.baseName(), (down ? 1 : 0), context);
 
-        boolean nodeDownInNodeRepo = node.isDown();
+        boolean nodeDownInNodeRepo = node.history().isDown();
         metric.set(ConfigServerMetrics.DOWN_IN_NODE_REPO.baseName(), (nodeDownInNodeRepo ? 1 : 0), context);
     }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 6c4be09c489..27301c9bf2a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -25,6 +25,7 @@ import java.time.Duration;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Comparator;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
@@ -33,6 +34,7 @@ import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 /**
  * Maintains information in the node repo about when this node last responded to ping
@@ -110,12 +112,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
                 failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes"));
 
         for (Node node : activeNodes) {
-            Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
-            if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !affectedByMaintenance(node)) {
-                // Allow a grace period after node re-activation
-                if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
-                    failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
-            }
+            downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant)));
         }
 
         for (Node node : activeNodes) {
@@ -149,6 +146,26 @@ public class NodeFailer extends NodeRepositoryMaintainer {
         return !reasonsToFailHost(host).isEmpty();
     }
 
+    private Optional<Instant> downSince(Node node) {
+        Optional<Instant> downInstant = node.history().downSince();
+        if (downInstant.isEmpty()) return Optional.empty();
+
+        Instant downSince = Stream.of(downInstant,
+                                      node.history().resumedSince(),
+                                      node.history().event(History.Event.Type.activated).map(History.Event::at))
+                                  .filter(Optional::isPresent)
+                                  .map(Optional::get)
+                                  .max(Comparator.naturalOrder())
+                                  .orElseThrow();
+        Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit;
+        if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty();
+
+        if (applicationSuspended(node)) return Optional.empty();
+        if (affectedByMaintenance(node)) return Optional.empty();
+
+        return Optional.of(downSince);
+    }
+
     private boolean applicationSuspended(Node node) {
         try {
             return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner())
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index e0421b78ff2..4dc9c62ce0f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -12,6 +12,10 @@ import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.orchestrator.Orchestrator;
+import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
+import com.yahoo.vespa.orchestrator.status.HostInfo;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
 import com.yahoo.vespa.service.monitor.ServiceMonitor;
 import com.yahoo.yolean.Exceptions;
 
@@ -20,9 +24,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.logging.Level;
-import java.util.stream.Collectors;
 
 import static java.util.stream.Collectors.counting;
+import static java.util.stream.Collectors.groupingBy;
 
 /**
  * Checks if nodes are responding and updates their status accordingly
@@ -33,11 +37,13 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
 
     /** Provides (more accurate) information about the status of active hosts */
     private final ServiceMonitor serviceMonitor;
+    private final Orchestrator orchestrator;
 
    public NodeHealthTracker(ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
                              Duration interval, Metric metric) {
         super(nodeRepository, interval, metric);
         this.serviceMonitor = serviceMonitor;
+        this.orchestrator = nodeRepository().orchestrator();
     }
 
     @Override
@@ -55,10 +61,14 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
         serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
             Optional<Node> node = activeNodes.node(hostname.toString());
             if (node.isEmpty()) return;
+            boolean isDown = allDown(serviceInstances);
+
+            Optional<HostInfo> status = orchestrator.getOptionalNodeStatus(node.get().hostname());
+            if (status.isEmpty()) return;
+            boolean isSuspended = status.get().isSuspended();
 
             // Already correct record, nothing to do
-            boolean isDown = allDown(serviceInstances);
-            if (isDown == node.get().isDown() && isDown != node.get().isUp()) return;
+            if (isDownConsistent(node.get(), isDown) && isSuspendedConsistent(node.get(), isSuspended)) return;
 
             // Lock and update status
             ApplicationId owner = node.get().allocation().get().owner();
@@ -66,11 +76,14 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
                 node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
                 if (node.isEmpty()) return; // Node disappeared or changed allocation
                 attempts.add(1);
-                if (isDown) {
-                    recordAsDown(node.get(), lock);
-                } else {
-                    recordAsUp(node.get(), lock);
-                }
+
+                Node newNode = node.get();
+                if (!isDownConsistent(newNode, isDown))
+                    newNode = isDown ? newNode.downAt(clock().instant(), Agent.NodeHealthTracker) : newNode.upAt(clock().instant(), Agent.NodeHealthTracker);
+                if (!isSuspendedConsistent(newNode, isSuspended))
+                    newNode = isSuspended ? newNode.suspendedAt(clock().instant(), Agent.NodeHealthTracker) : newNode.resumedAt(clock().instant(), Agent.NodeHealthTracker);
+                if (newNode != node.get())
+                    nodeRepository().nodes().write(newNode, lock);
             } catch (ApplicationLockException e) {
                 // Fine, carry on with other nodes. We'll try updating this one in the next run
                 log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
@@ -85,8 +98,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
      * If a node remains bad for a long time, the NodeFailer will try to fail the node.
      */
     static boolean allDown(List<ServiceInstance> services) {
-        Map<ServiceStatus, Long> countsByStatus = services.stream()
-                .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
+        Map<ServiceStatus, Long> countsByStatus = services.stream().collect(groupingBy(ServiceInstance::serviceStatus, counting()));
 
         return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
                countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L &&
@@ -101,16 +113,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
                 .filter(node -> node.allocation().get().owner().equals(application));
     }
 
-    /** Record a node as down if not already recorded */
-    private void recordAsDown(Node node, Mutex lock) {
-        if (node.isDown()) return; // already down: Don't change down timestamp
-        nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock);
+    private static boolean isDownConsistent(Node node, boolean isDown) {
+        return isDown ? node.history().isDown() : node.history().isUp();
     }
 
-    /** Clear down record for node, if any */
-    private void recordAsUp(Node node, Mutex lock) {
-        if (node.isUp()) return; // already up: Don't change up timestamp
-        nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock);
+    private static boolean isSuspendedConsistent(Node node, boolean isSuspended) {
+        return isSuspended ? node.history().isSuspended() : node.history().isResumed();
+    }
-
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index 7c33fd14769..538b5ccd617 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -12,7 +12,6 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.stream.Collectors;
 
 /**
  * An immutable record of the last event of each type happening to this node, and a chronological log of the events.
@@ -81,6 +80,34 @@ public class History {
                 .orElse(false);
     }
 
+    /** Returns the instant the services went down, unless the services have been up afterward. */
+    public Optional<Instant> downSince() { return instantOf(Event.Type.down, Event.Type.up); }
+
+    /** Returns the instant the services went up, unless the services have been down afterward. */
+    public Optional<Instant> upSince() { return instantOf(Event.Type.up, Event.Type.down); }
+
+    /** Returns true if there is a down event without a later up. */
+    public boolean isDown() { return downSince().isPresent(); }
+
+    /** Returns true if there is an up event without a later down. */
+    public boolean isUp() { return upSince().isPresent(); }
+
+    /** Returns the instant the node suspended, unless the node has been resumed afterward. */
+    public Optional<Instant> suspendedSince() { return instantOf(Event.Type.suspended, Event.Type.resumed); }
+
+    /** Returns the instant the node was resumed, unless the node has been suspended afterward. */
+    public Optional<Instant> resumedSince() { return instantOf(Event.Type.resumed, Event.Type.suspended); }
+
+    /** Returns true if there is a suspended event without a later resumed. */
+    public boolean isSuspended() { return suspendedSince().isPresent(); }
+
+    /** Returns true if there is a resumed event without a later suspended. */
+    public boolean isResumed() { return resumedSince().isPresent(); }
+
+    private Optional<Instant> instantOf(History.Event.Type type, History.Event.Type sentinelType) {
+        return event(type).map(History.Event::at).filter(instant -> !hasEventAfter(sentinelType, instant));
+    }
+
     /** Returns the last event of each type in this history */
     public Collection<Event> events() { return events.values(); }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index e160a85a4e2..9fffdd574dc 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -457,7 +457,8 @@ public class NodeSerializer {
             case down -> "down";
             case up -> "up";
             case suspended -> "suspended";
-            case resumed -> "resumed"; case resized -> "resized";
+            case resumed -> "resumed";
+            case resized -> "resized";
             case rebooted -> "rebooted";
             case osUpgraded -> "osUpgraded";
             case firmwareVerified -> "firmwareVerified";
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
index b8c841771f5..c518087f325 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
@@ -180,7 +180,7 @@ class NodesResponse extends SlimeJsonResponse {
         object.setBool("wantToDeprovision", node.status().wantToDeprovision());
         object.setBool("wantToRebuild", node.status().wantToRebuild());
         object.setBool("wantToUpgradeFlavor", node.status().wantToUpgradeFlavor());
-        object.setBool("down", node.isDown());
+        object.setBool("down", node.history().isDown());
         toSlime(node.history().events(), object.setArray("history"));
         toSlime(node.history().log(), object.setArray("log"));
         ipAddressesToSlime(node.ipConfig().primary(), object.setArray("ipAddresses"));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java
index b818b54070c..f0b006bcc8e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java
@@ -1,8 +1,10 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.provision.testutils;
 
+import com.google.inject.Inject;
 import com.yahoo.component.AbstractComponent;
 import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.jdisc.test.TestTimer;
 import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference;
 import com.yahoo.vespa.applicationmodel.HostName;
 import com.yahoo.vespa.orchestrator.Host;
@@ -11,6 +13,7 @@ import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
 import com.yahoo.vespa.orchestrator.status.HostInfo;
 import com.yahoo.vespa.orchestrator.status.HostStatus;
 
+import java.time.Clock;
 import java.time.Instant;
 import java.util.Collections;
 import java.util.HashMap;
@@ -28,6 +31,14 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator {
 
     private final Map<HostName, HostInfo> suspendedHosts = new HashMap<>();
     private final Set<ApplicationInstanceReference> suspendedApplications = new HashSet<>();
+    private final Clock clock;
+
+    @Inject
+    public OrchestratorMock() { this(new TestTimer(Instant.EPOCH).toUtcClock()); }
+
+    public OrchestratorMock(Clock clock) {
+        this.clock = clock;
+    }
 
     @Override
     public Host getHost(HostName hostName) {
@@ -52,7 +63,13 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator {
     }
 
     @Override
-    public void setNodeStatus(HostName hostName, HostStatus state) {}
+    public void setNodeStatus(HostName hostName, HostStatus state) {
+        if (state == HostStatus.NO_REMARKS) {
+            suspendedHosts.remove(hostName);
+        } else {
+            suspendedHosts.put(hostName, HostInfo.createSuspended(state, clock.instant()));
+        }
+    }
 
     @Override
     public void resume(HostName hostName) {
@@ -61,7 +78,7 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator {
 
     @Override
     public void suspend(HostName hostName) {
-        suspendedHosts.put(hostName, HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.EPOCH));
+        suspendedHosts.put(hostName, HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, clock.instant()));
     }
 
     @Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 94090b38cb7..6100e87c5ec 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -7,6 +7,7 @@ import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.slime.SlimeUtils;
+import com.yahoo.vespa.applicationmodel.HostName;
 import com.yahoo.vespa.applicationmodel.ServiceInstance;
 import com.yahoo.vespa.applicationmodel.ServiceStatus;
 import com.yahoo.vespa.hosted.provision.Node;
@@ -14,6 +15,7 @@ import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.node.Agent;
 import com.yahoo.vespa.hosted.provision.node.Report;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
 import org.junit.Test;
 
 import java.time.Duration;
@@ -170,8 +172,8 @@ public class NodeFailerTest {
         tester.clock.advance(Duration.ofMinutes(65));
         tester.runMaintainers();
 
-        assertTrue(tester.nodeRepository.nodes().node(host_from_normal_app).get().isDown());
-        assertTrue(tester.nodeRepository.nodes().node(host_from_suspended_app).get().isDown());
+        assertTrue(tester.nodeRepository.nodes().node(host_from_normal_app).get().history().isDown());
+        assertTrue(tester.nodeRepository.nodes().node(host_from_suspended_app).get().history().isDown());
         assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(host_from_normal_app).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.nodes().node(host_from_suspended_app).get().state());
     }
@@ -203,8 +205,10 @@ public class NodeFailerTest {
         String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
         String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname();
         // No liveness evidence yet:
-        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isSuspended());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isResumed());
 
         // For a day all nodes work so nothing happens
         for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
@@ -214,8 +218,10 @@ public class NodeFailerTest {
             assertEquals(0, tester.deployer.activations);
             assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
             assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
-            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
+            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isSuspended());
+            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isResumed());
         }
 
         tester.serviceMonitor.setHostDown(downHost1);
@@ -227,16 +233,16 @@ public class NodeFailerTest {
             assertEquals(0, tester.deployer.activations);
             assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
             assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
-            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
         }
         tester.serviceMonitor.setHostUp(downHost1);
 
         // downHost2 should now be failed and replaced, but not downHost1
         tester.clock.advance(Duration.ofDays(1));
         tester.runMaintainers();
-        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-        assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+        assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
         assertEquals(1, tester.deployer.activations);
         assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
         assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
@@ -313,6 +319,64 @@ public class NodeFailerTest {
         assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
     }
 
+    @Test
+    public void suspension_extends_grace_period() {
+        NodeFailTester tester = NodeFailTester.withTwoApplications();
+        String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+
+        // host down, but within 1h timeout
+        tester.serviceMonitor.setHostDown(downNode);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // 30m is still within 1h timeout
+        tester.clock.advance(Duration.ofMinutes(30));
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // suspend
+        tester.clock.advance(Duration.ofSeconds(5));
+        tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.ALLOWED_TO_BE_DOWN);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // the timeout should now be 4h, so still ~3:30 left.
+        tester.clock.advance(Duration.ofHours(3));
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // advancing another hour takes us beyond the 4h timeout
+        tester.clock.advance(Duration.ofHours(1));
+        tester.runMaintainers();
+        assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
+    }
+
+    @Test
+    public void suspension_defers_downtime() {
+        NodeFailTester tester = NodeFailTester.withTwoApplications();
+        String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+
+        // host suspends and goes down
+        tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.ALLOWED_TO_BE_DOWN);
+        tester.serviceMonitor.setHostDown(downNode);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // host resumes after 30m
+        tester.clock.advance(Duration.ofMinutes(30));
+        tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.NO_REMARKS);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // the host should fail 1h after resume, not when the node goes down. Verify this
+        tester.clock.advance(Duration.ofMinutes(45));
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+        tester.clock.advance(Duration.ofMinutes(30));
+        tester.runMaintainers();
+        assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
+    }
+
     @Test
     public void node_failing_can_allocate_spare() {
         var resources = new NodeResources(1, 20, 15, 1);
@@ -653,21 +717,21 @@ public class NodeFailerTest {
         tester.serviceMonitor.setHostDown(downHost);
         tester.runMaintainers();
         node = tester.nodeRepository.nodes().node(downHost).get();
-        assertTrue(node.isDown());
+        assertTrue(node.history().isDown());
         assertEquals(Node.State.active, node.state());
 
         // CMR still ongoing, don't fail yet
         clock.advance(Duration.ofHours(1));
         tester.runMaintainers();
         node = tester.nodeRepository.nodes().node(downHost).get();
-        assertTrue(node.isDown());
+        assertTrue(node.history().isDown());
         assertEquals(Node.State.active, node.state());
 
         // No ongoing CMR anymore, host should be failed
         clock.advance(Duration.ofHours(1));
         tester.runMaintainers();
         node = tester.nodeRepository.nodes().node(downHost).get();
-        assertTrue(node.isDown());
+        assertTrue(node.history().isDown());
         assertEquals(Node.State.failed, node.state());
     }
 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java
index bb815168ea7..d945c8de7b8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java
@@ -73,7 +73,7 @@ public class RetiredExpirerTest {
     public void setup() throws OrchestrationException {
         // By default, orchestrator should deny all request for suspension so we can test expiration
         doThrow(new RuntimeException()).when(orchestrator).acquirePermissionToRemove(any());
-        when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS);
+        when(orchestrator.getNodeStatus(any(HostName.class))).thenReturn(HostStatus.NO_REMARKS);
     }
 
     @Test
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
index b091603aaeb..48bed11d83f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.provision.provisioning;
 import com.yahoo.component.Version;
 import com.yahoo.config.provision.ActivationContext;
 import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ApplicationMutex;
 import com.yahoo.config.provision.ApplicationName;
 import com.yahoo.config.provision.ApplicationTransaction;
 import com.yahoo.config.provision.Capacity;
@@ -22,7 +23,6 @@ import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.NodeResources.DiskSpeed;
 import com.yahoo.config.provision.NodeResources.StorageType;
 import com.yahoo.config.provision.NodeType;
-import com.yahoo.config.provision.ApplicationMutex;
 import com.yahoo.config.provision.RegionName;
 import com.yahoo.config.provision.SystemName;
 import com.yahoo.config.provision.TenantName;
@@ -100,19 +100,20 @@ public class ProvisioningTester {
     private int nextIP = 0;
 
     private ProvisioningTester(Curator curator,
-                               NodeFlavors nodeFlavors,
-                               HostResourcesCalculator resourcesCalculator,
-                               Zone zone,
-                               NameResolver nameResolver,
-                               DockerImage containerImage,
-                               Orchestrator orchestrator,
-                               HostProvisioner hostProvisioner,
-                               LoadBalancerServiceMock loadBalancerService,
-                               FlagSource flagSource,
-                               int spareCount) {
+                               NodeFlavors nodeFlavors,
+                               HostResourcesCalculator resourcesCalculator,
+                               Zone zone,
+                               NameResolver nameResolver,
+                               DockerImage containerImage,
+                               Orchestrator orchestrator,
+                               HostProvisioner hostProvisioner,
+                               LoadBalancerServiceMock loadBalancerService,
+                               FlagSource flagSource,
+                               int spareCount,
+                               ManualClock clock) {
         this.curator = curator;
         this.nodeFlavors = nodeFlavors;
-        this.clock = new ManualClock();
+        this.clock = clock;
         this.hostProvisioner = hostProvisioner;
         ProvisionServiceProvider provisionServiceProvider = new MockProvisionServiceProvider(loadBalancerService, hostProvisioner, resourcesCalculator);
         this.nodeRepository = new NodeRepository(nodeFlavors,
@@ -658,6 +659,7 @@ public class ProvisioningTester {
         private HostProvisioner hostProvisioner;
         private FlagSource flagSource;
         private int spareCount = 0;
+        private ManualClock clock = new ManualClock();
         private DockerImage defaultImage = DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa");
 
         public Builder curator(Curator curator) {
@@ -735,6 +737,11 @@ public class ProvisioningTester {
             return this;
         }
 
+        public Builder clock(ManualClock clock) {
+            this.clock = clock;
+            return this;
+        }
+
         private FlagSource defaultFlagSource() {
             return new InMemoryFlagSource();
         }
@@ -746,11 +753,12 @@ public class ProvisioningTester {
                                           Optional.ofNullable(zone).orElseGet(Zone::defaultZone),
                                           Optional.ofNullable(nameResolver).orElseGet(() -> new MockNameResolver().mockAnyLookup()),
                                           defaultImage,
-                                          Optional.ofNullable(orchestrator).orElseGet(OrchestratorMock::new),
+                                          Optional.ofNullable(orchestrator).orElseGet(() -> new OrchestratorMock(clock)),
                                           hostProvisioner,
                                           new LoadBalancerServiceMock(),
                                           Optional.ofNullable(flagSource).orElse(defaultFlagSource()),
-                                          spareCount);
+                                          spareCount,
+                                          clock);
         }
 
         private static FlavorsConfig asConfig(List<Flavor> flavors) {
-- 
cgit v1.2.3
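
The rule this patch introduces in NodeFailer.downSince is: a node's downtime clock restarts at the latest of its down, resumed, and activated events, and the grace period before failing is longer while the node is suspended (1h normally versus 4h suspended, per the constants implied by the tests above). Below is a minimal standalone sketch of that rule, not the actual Vespa code; the class name, the NodeEvents record, and the constants are illustrative assumptions that only mirror the behavior exercised by the tests.

import java.time.Duration;
import java.time.Instant;
import java.util.Comparator;
import java.util.Optional;
import java.util.stream.Stream;

// Standalone illustration of the grace-period rule; names and constants are hypothetical.
class DownSinceRule {
    static final Duration DOWN_TIME_LIMIT = Duration.ofHours(1);           // normal grace period
    static final Duration SUSPENDED_DOWN_TIME_LIMIT = Duration.ofHours(4); // grace period while suspended

    /** Events observed for one node; an empty Optional means that event never happened. */
    record NodeEvents(Optional<Instant> down, Optional<Instant> resumed,
                      Optional<Instant> activated, boolean suspended) {}

    /** Returns the instant the node counts as down since, if it should be failed at 'now'. */
    static Optional<Instant> downSince(NodeEvents events, Instant now) {
        if (events.down().isEmpty()) return Optional.empty();

        // The downtime clock starts at the latest of down, resumed, and activated:
        // resuming or (re)activating a down node resets its accumulated downtime.
        Instant downSince = Stream.of(events.down(), events.resumed(), events.activated())
                                  .flatMap(Optional::stream)
                                  .max(Comparator.naturalOrder())
                                  .orElseThrow();

        // Suspended nodes are allowed to stay down four times as long.
        Duration grace = events.suspended() ? SUSPENDED_DOWN_TIME_LIMIT : DOWN_TIME_LIMIT;
        return now.isBefore(downSince.plus(grace)) ? Optional.empty() : Optional.of(downSince);
    }
}

Played against the suspension_defers_downtime test above: the node is suspended and down at t=0, resuming at t=30m resets downSince to 30m and drops the grace period back to 1h, so failure occurs after t=1h30m, which matches the test's assertions at 75m (still active) and 105m (failed).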