summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Musum <musum@vespa.ai>2024-01-06 00:56:32 +0100
committerGitHub <noreply@github.com>2024-01-06 00:56:32 +0100
commit93203c07633be5c148f4c2b23746f4dac83561b2 (patch)
tree8f1239d3816625b4519324d8c9736f9d6ae64f6b
parente4da75db4556a3cd72b034c4406027f9bba73918 (diff)
Revert "Reset downtime at resume"
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java18
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java29
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java45
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java33
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java21
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java90
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java36
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java8
13 files changed, 71 insertions, 221 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 48c85aed640..567a5c03f43 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -431,14 +431,20 @@ public final class Node implements Nodelike {
return with(history.with(new History.Event(History.Event.Type.up, agent, instant)));
}
- /** Returns a copy of this with a history event saying it has been suspended at instant. */
- public Node suspendedAt(Instant instant, Agent agent) {
- return with(history.with(new History.Event(History.Event.Type.suspended, agent, instant)));
+ /** Returns true if we have positive evidence that this node is down. */
+ public boolean isDown() {
+ Optional<Instant> downAt = history().event(History.Event.Type.down).map(History.Event::at);
+ if (downAt.isEmpty()) return false;
+
+ return !history().hasEventAfter(History.Event.Type.up, downAt.get());
}
- /** Returns a copy of this with a history event saying it has been resumed at instant. */
- public Node resumedAt(Instant instant, Agent agent) {
- return with(history.with(new History.Event(History.Event.Type.resumed, agent, instant)));
+ /** Returns true if we have positive evidence that this node is up. */
+ public boolean isUp() {
+ Optional<Instant> upAt = history().event(History.Event.Type.up).map(History.Event::at);
+ if (upAt.isEmpty()) return false;
+
+ return !history().hasEventAfter(History.Event.Type.down, upAt.get());
}
/** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index bed3d9fcb04..957996f05e4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -221,7 +221,7 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
}
/** Returns the subset of nodes which have a record of being down */
- public NodeList down() { return matching(node -> node.history().isDown()); }
+ public NodeList down() { return matching(Node::isDown); }
/** Returns the subset of nodes which are being retired */
public NodeList retiring() {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index b644e5d8a08..c203f8a8321 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -268,7 +268,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
boolean down = NodeHealthTracker.allDown(services);
metric.set(ConfigServerMetrics.NODE_FAILER_BAD_NODE.baseName(), (down ? 1 : 0), context);
- boolean nodeDownInNodeRepo = node.history().isDown();;
+ boolean nodeDownInNodeRepo = node.isDown();
metric.set(ConfigServerMetrics.DOWN_IN_NODE_REPO.baseName(), (nodeDownInNodeRepo ? 1 : 0), context);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 27301c9bf2a..6c4be09c489 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -25,7 +25,6 @@ import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
-import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
@@ -34,7 +33,6 @@ import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
/**
* Maintains information in the node repo about when this node last responded to ping
@@ -112,7 +110,12 @@ public class NodeFailer extends NodeRepositoryMaintainer {
failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes"));
for (Node node : activeNodes) {
- downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant)));
+ Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
+ if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !affectedByMaintenance(node)) {
+ // Allow a grace period after node re-activation
+ if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
+ failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
+ }
}
for (Node node : activeNodes) {
@@ -146,26 +149,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return !reasonsToFailHost(host).isEmpty();
}
- private Optional<Instant> downSince(Node node) {
- Optional<Instant> downInstant = node.history().downSince();
- if (downInstant.isEmpty()) return Optional.empty();
-
- Instant downSince = Stream.of(downInstant,
- node.history().resumedSince(),
- node.history().event(History.Event.Type.activated).map(History.Event::at))
- .filter(Optional::isPresent)
- .map(Optional::get)
- .max(Comparator.naturalOrder())
- .orElseThrow();
- Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit;
- if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty();
-
- if (applicationSuspended(node)) return Optional.empty();
- if (affectedByMaintenance(node)) return Optional.empty();
-
- return Optional.of(downSince);
- }
-
private boolean applicationSuspended(Node node) {
try {
return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner())
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index 4dc9c62ce0f..e0421b78ff2 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -12,10 +12,6 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
-import com.yahoo.vespa.orchestrator.Orchestrator;
-import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
-import com.yahoo.vespa.orchestrator.status.HostInfo;
-import com.yahoo.vespa.orchestrator.status.HostStatus;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import com.yahoo.yolean.Exceptions;
@@ -24,9 +20,9 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
+import java.util.stream.Collectors;
import static java.util.stream.Collectors.counting;
-import static java.util.stream.Collectors.groupingBy;
/**
* Checks if nodes are responding and updates their status accordingly
@@ -37,13 +33,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
/** Provides (more accurate) information about the status of active hosts */
private final ServiceMonitor serviceMonitor;
- private final Orchestrator orchestrator;
public NodeHealthTracker(ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
Duration interval, Metric metric) {
super(nodeRepository, interval, metric);
this.serviceMonitor = serviceMonitor;
- this.orchestrator = nodeRepository().orchestrator();
}
@Override
@@ -61,14 +55,10 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
Optional<Node> node = activeNodes.node(hostname.toString());
if (node.isEmpty()) return;
- boolean isDown = allDown(serviceInstances);
-
- Optional<HostStatus> status = orchestrator.getOptionalNodeStatus(node.get().hostname());
- if (status.isEmpty()) return;
- boolean isSuspended = status.get().isSuspended();
// Already correct record, nothing to do
- if (isDownConsistent(node.get(), isDown) && isSuspendedConsistent(node.get(), isSuspended)) return;
+ boolean isDown = allDown(serviceInstances);
+ if (isDown == node.get().isDown() && isDown != node.get().isUp()) return;
// Lock and update status
ApplicationId owner = node.get().allocation().get().owner();
@@ -76,14 +66,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
if (node.isEmpty()) return; // Node disappeared or changed allocation
attempts.add(1);
-
- Node newNode = node.get();
- if (!isDownConsistent(newNode, isDown))
- newNode = isDown ? newNode.downAt(clock().instant(), Agent.NodeHealthTracker) : newNode.upAt(clock().instant(), Agent.NodeHealthTracker);
- if (!isSuspendedConsistent(newNode, isSuspended))
- newNode = isSuspended ? newNode.suspendedAt(clock().instant(), Agent.NodeHealthTracker) : newNode.resumedAt(clock().instant(), Agent.NodeHealthTracker);
- if (newNode != node.get())
- nodeRepository().nodes().write(newNode, lock);
+ if (isDown) {
+ recordAsDown(node.get(), lock);
+ } else {
+ recordAsUp(node.get(), lock);
+ }
} catch (ApplicationLockException e) {
// Fine, carry on with other nodes. We'll try updating this one in the next run
log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
@@ -98,7 +85,8 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
* If a node remains bad for a long time, the NodeFailer will try to fail the node.
*/
static boolean allDown(List<ServiceInstance> services) {
- Map<ServiceStatus, Long> countsByStatus = services.stream().collect(groupingBy(ServiceInstance::serviceStatus, counting()));
+ Map<ServiceStatus, Long> countsByStatus = services.stream()
+ .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L &&
@@ -113,11 +101,16 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
.filter(node -> node.allocation().get().owner().equals(application));
}
- private static boolean isDownConsistent(Node node, boolean isDown) {
- return isDown ? node.history().isDown() : node.history().isUp();
+ /** Record a node as down if not already recorded */
+ private void recordAsDown(Node node, Mutex lock) {
+ if (node.isDown()) return; // already down: Don't change down timestamp
+ nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock);
}
- private static boolean isSuspendedConsistent(Node node, boolean isSuspended) {
- return isSuspended ? node.history().isSuspended() : node.history().isResumed();
+ /** Clear down record for node, if any */
+ private void recordAsUp(Node node, Mutex lock) {
+ if (node.isUp()) return; // already up: Don't change up timestamp
+ nodeRepository().nodes().write(node.upAt(clock().instant(), Agent.NodeHealthTracker), lock);
}
+
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index 538b5ccd617..491b388416b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -12,6 +12,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
+import java.util.stream.Collectors;
/**
* An immutable record of the last event of each type happening to this node, and a chronological log of the events.
@@ -80,34 +81,6 @@ public class History {
.orElse(false);
}
- /** Returns the instant the services went down, unless the services have been up afterward. */
- public Optional<Instant> downSince() { return instantOf(Event.Type.down, Event.Type.up); }
-
- /** Returns the instant the services went up, unless the services have been down afterward. */
- public Optional<Instant> upSince() { return instantOf(Event.Type.up, Event.Type.down); }
-
- /** Returns true if there is a down event without a later up. */
- public boolean isDown() { return downSince().isPresent(); }
-
- /** Returns true if there is an up event without a later down. */
- public boolean isUp() { return upSince().isPresent(); }
-
- /** Returns the instant the node suspended, unless the node has been resumed afterward. */
- public Optional<Instant> suspendedSince() { return instantOf(Event.Type.suspended, Event.Type.resumed); }
-
- /** Returns the instant the node was resumed, unless the node has been suspended afterward. */
- public Optional<Instant> resumedSince() { return instantOf(Event.Type.resumed, Event.Type.suspended); }
-
- /** Returns true if there is a suspended event without a later resumed. */
- public boolean isSuspended() { return suspendedSince().isPresent(); }
-
- /** Returns true if there is a resumed event without a later suspended. */
- public boolean isResumed() { return resumedSince().isPresent(); }
-
- private Optional<Instant> instantOf(History.Event.Type type, History.Event.Type sentinelType) {
- return event(type).map(History.Event::at).filter(instant -> !hasEventAfter(sentinelType, instant));
- }
-
/** Returns the last event of each type in this history */
public Collection<Event> events() { return events.values(); }
@@ -207,10 +180,6 @@ public class History {
down,
// The active node came up according to the service monitor
up,
- // The node has been given permission to suspend by Orchestrator
- suspended,
- // The node has resumed from suspension by Orchestrator
- resumed,
// The node resources/flavor were changed
resized(false),
// The node was rebooted
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 9fffdd574dc..5efe5d8b2a8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -428,8 +428,6 @@ public class NodeSerializer {
case "deallocated" -> History.Event.Type.deallocated;
case "down" -> History.Event.Type.down;
case "up" -> History.Event.Type.up;
- case "suspended" -> History.Event.Type.suspended;
- case "resumed" -> History.Event.Type.resumed;
case "resized" -> History.Event.Type.resized;
case "rebooted" -> History.Event.Type.rebooted;
case "osUpgraded" -> History.Event.Type.osUpgraded;
@@ -456,8 +454,6 @@ public class NodeSerializer {
case deallocated -> "deallocated";
case down -> "down";
case up -> "up";
- case suspended -> "suspended";
- case resumed -> "resumed";
case resized -> "resized";
case rebooted -> "rebooted";
case osUpgraded -> "osUpgraded";
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
index c518087f325..b8c841771f5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
@@ -180,7 +180,7 @@ class NodesResponse extends SlimeJsonResponse {
object.setBool("wantToDeprovision", node.status().wantToDeprovision());
object.setBool("wantToRebuild", node.status().wantToRebuild());
object.setBool("wantToUpgradeFlavor", node.status().wantToUpgradeFlavor());
- object.setBool("down", node.history().isDown());
+ object.setBool("down", node.isDown());
toSlime(node.history().events(), object.setArray("history"));
toSlime(node.history().log(), object.setArray("log"));
ipAddressesToSlime(node.ipConfig().primary(), object.setArray("ipAddresses"));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java
index f0b006bcc8e..b818b54070c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java
@@ -1,10 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.testutils;
-import com.google.inject.Inject;
import com.yahoo.component.AbstractComponent;
import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.jdisc.test.TestTimer;
import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.orchestrator.Host;
@@ -13,7 +11,6 @@ import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.vespa.orchestrator.status.HostInfo;
import com.yahoo.vespa.orchestrator.status.HostStatus;
-import java.time.Clock;
import java.time.Instant;
import java.util.Collections;
import java.util.HashMap;
@@ -31,14 +28,6 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator
private final Map<HostName, HostInfo> suspendedHosts = new HashMap<>();
private final Set<ApplicationId> suspendedApplications = new HashSet<>();
- private final Clock clock;
-
- @Inject
- public OrchestratorMock() { this(new TestTimer(Instant.EPOCH).toUtcClock()); }
-
- public OrchestratorMock(Clock clock) {
- this.clock = clock;
- }
@Override
public Host getHost(HostName hostName) {
@@ -63,13 +52,7 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator
}
@Override
- public void setNodeStatus(HostName hostName, HostStatus state) {
- if (state == HostStatus.NO_REMARKS) {
- suspendedHosts.remove(hostName);
- } else {
- suspendedHosts.put(hostName, HostInfo.createSuspended(state, clock.instant()));
- }
- }
+ public void setNodeStatus(HostName hostName, HostStatus state) {}
@Override
public void resume(HostName hostName) {
@@ -78,7 +61,7 @@ public class OrchestratorMock extends AbstractComponent implements Orchestrator
@Override
public void suspend(HostName hostName) {
- suspendedHosts.put(hostName, HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, clock.instant()));
+ suspendedHosts.put(hostName, HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.EPOCH));
}
@Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 6100e87c5ec..94090b38cb7 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.slime.SlimeUtils;
-import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
@@ -15,7 +14,6 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Report;
-import com.yahoo.vespa.orchestrator.status.HostStatus;
import org.junit.Test;
import java.time.Duration;
@@ -172,8 +170,8 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(65));
tester.runMaintainers();
- assertTrue(tester.nodeRepository.nodes().node(host_from_normal_app).get().history().isDown());
- assertTrue(tester.nodeRepository.nodes().node(host_from_suspended_app).get().history().isDown());
+ assertTrue(tester.nodeRepository.nodes().node(host_from_normal_app).get().isDown());
+ assertTrue(tester.nodeRepository.nodes().node(host_from_suspended_app).get().isDown());
assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(host_from_normal_app).get().state());
assertEquals(Node.State.active, tester.nodeRepository.nodes().node(host_from_suspended_app).get().state());
}
@@ -205,10 +203,8 @@ public class NodeFailerTest {
String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname();
// No liveness evidence yet:
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isSuspended());
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isResumed());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
// For a day all nodes work so nothing happens
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
@@ -218,10 +214,8 @@ public class NodeFailerTest {
assertEquals(0, tester.deployer.activations);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
- assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isSuspended());
- assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isResumed());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
}
tester.serviceMonitor.setHostDown(downHost1);
@@ -233,16 +227,16 @@ public class NodeFailerTest {
assertEquals(0, tester.deployer.activations);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
- assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
+ assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
}
tester.serviceMonitor.setHostUp(downHost1);
// downHost2 should now be failed and replaced, but not downHost1
tester.clock.advance(Duration.ofDays(1));
tester.runMaintainers();
- assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
- assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
+ assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
+ assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
assertEquals(1, tester.deployer.activations);
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
@@ -320,64 +314,6 @@ public class NodeFailerTest {
}
@Test
- public void suspension_extends_grace_period() {
- NodeFailTester tester = NodeFailTester.withTwoApplications();
- String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
-
- // host down, but within 1h timeout
- tester.serviceMonitor.setHostDown(downNode);
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
-
- // 30m is still within 1h timeout
- tester.clock.advance(Duration.ofMinutes(30));
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
-
- // suspend
- tester.clock.advance(Duration.ofSeconds(5));
- tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.ALLOWED_TO_BE_DOWN);
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
-
- // the timeout should now be 4h, so still ~3:30 left.
- tester.clock.advance(Duration.ofHours(3));
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
-
- // advancing another hour takes us beyond the 4h timeout
- tester.clock.advance(Duration.ofHours(1));
- tester.runMaintainers();
- assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
- }
-
- @Test
- public void suspension_defers_downtime() {
- NodeFailTester tester = NodeFailTester.withTwoApplications();
- String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
-
- // host suspends and goes down
- tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.ALLOWED_TO_BE_DOWN);
- tester.serviceMonitor.setHostDown(downNode);
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
-
- // host resumes after 30m
- tester.clock.advance(Duration.ofMinutes(30));
- tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.NO_REMARKS);
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
-
- // the host should fail 1h after resume, not when the node goes down. Verify this
- tester.clock.advance(Duration.ofMinutes(45));
- tester.runMaintainers();
- assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
- tester.clock.advance(Duration.ofMinutes(30));
- tester.runMaintainers();
- assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
- }
-
- @Test
public void node_failing_can_allocate_spare() {
var resources = new NodeResources(1, 20, 15, 1);
Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true);
@@ -717,21 +653,21 @@ public class NodeFailerTest {
tester.serviceMonitor.setHostDown(downHost);
tester.runMaintainers();
node = tester.nodeRepository.nodes().node(downHost).get();
- assertTrue(node.history().isDown());
+ assertTrue(node.isDown());
assertEquals(Node.State.active, node.state());
// CMR still ongoing, don't fail yet
clock.advance(Duration.ofHours(1));
tester.runMaintainers();
node = tester.nodeRepository.nodes().node(downHost).get();
- assertTrue(node.history().isDown());
+ assertTrue(node.isDown());
assertEquals(Node.State.active, node.state());
// No ongoing CMR anymore, host should be failed
clock.advance(Duration.ofHours(1));
tester.runMaintainers();
node = tester.nodeRepository.nodes().node(downHost).get();
- assertTrue(node.history().isDown());
+ assertTrue(node.isDown());
assertEquals(Node.State.failed, node.state());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java
index d945c8de7b8..bb815168ea7 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java
@@ -73,7 +73,7 @@ public class RetiredExpirerTest {
public void setup() throws OrchestrationException {
// By default, orchestrator should deny all request for suspension so we can test expiration
doThrow(new RuntimeException()).when(orchestrator).acquirePermissionToRemove(any());
- when(orchestrator.getNodeStatus(any(HostName.class))).thenReturn(HostStatus.NO_REMARKS);
+ when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS);
}
@Test
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
index 48bed11d83f..b091603aaeb 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
@@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.provision.provisioning;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ActivationContext;
import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.ApplicationMutex;
import com.yahoo.config.provision.ApplicationName;
import com.yahoo.config.provision.ApplicationTransaction;
import com.yahoo.config.provision.Capacity;
@@ -23,6 +22,7 @@ import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeResources.DiskSpeed;
import com.yahoo.config.provision.NodeResources.StorageType;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.config.provision.ApplicationMutex;
import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.TenantName;
@@ -100,20 +100,19 @@ public class ProvisioningTester {
private int nextIP = 0;
private ProvisioningTester(Curator curator,
- NodeFlavors nodeFlavors,
- HostResourcesCalculator resourcesCalculator,
- Zone zone,
- NameResolver nameResolver,
- DockerImage containerImage,
- Orchestrator orchestrator,
- HostProvisioner hostProvisioner,
- LoadBalancerServiceMock loadBalancerService,
- FlagSource flagSource,
- int spareCount,
- ManualClock clock) {
+ NodeFlavors nodeFlavors,
+ HostResourcesCalculator resourcesCalculator,
+ Zone zone,
+ NameResolver nameResolver,
+ DockerImage containerImage,
+ Orchestrator orchestrator,
+ HostProvisioner hostProvisioner,
+ LoadBalancerServiceMock loadBalancerService,
+ FlagSource flagSource,
+ int spareCount) {
this.curator = curator;
this.nodeFlavors = nodeFlavors;
- this.clock = clock;
+ this.clock = new ManualClock();
this.hostProvisioner = hostProvisioner;
ProvisionServiceProvider provisionServiceProvider = new MockProvisionServiceProvider(loadBalancerService, hostProvisioner, resourcesCalculator);
this.nodeRepository = new NodeRepository(nodeFlavors,
@@ -659,7 +658,6 @@ public class ProvisioningTester {
private HostProvisioner hostProvisioner;
private FlagSource flagSource;
private int spareCount = 0;
- private ManualClock clock = new ManualClock();
private DockerImage defaultImage = DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa");
public Builder curator(Curator curator) {
@@ -737,11 +735,6 @@ public class ProvisioningTester {
return this;
}
- public Builder clock(ManualClock clock) {
- this.clock = clock;
- return this;
- }
-
private FlagSource defaultFlagSource() {
return new InMemoryFlagSource();
}
@@ -753,12 +746,11 @@ public class ProvisioningTester {
Optional.ofNullable(zone).orElseGet(Zone::defaultZone),
Optional.ofNullable(nameResolver).orElseGet(() -> new MockNameResolver().mockAnyLookup()),
defaultImage,
- Optional.ofNullable(orchestrator).orElseGet(() -> new OrchestratorMock(clock)),
+ Optional.ofNullable(orchestrator).orElseGet(OrchestratorMock::new),
hostProvisioner,
new LoadBalancerServiceMock(),
Optional.ofNullable(flagSource).orElse(defaultFlagSource()),
- spareCount,
- clock);
+ spareCount);
}
private static FlavorsConfig asConfig(List<Flavor> flavors) {
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java
index f3a680478e3..2bbed38a7c4 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java
@@ -49,14 +49,6 @@ public interface Orchestrator {
*/
HostStatus getNodeStatus(HostName hostName) throws HostNameNotFoundException;
- default Optional<HostStatus> getOptionalNodeStatus(String hostname) {
- try {
- return Optional.of(getNodeStatus(new HostName(hostname)));
- } catch (HostNameNotFoundException e) {
- return Optional.empty();
- }
- }
-
/** Get host info for hostname in application, returning no-remarks if not in application. */
HostInfo getHostInfo(ApplicationInstanceReference reference, HostName hostname);