aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2020-11-13 08:22:34 +0100
committerJon Bratseth <bratseth@gmail.com>2020-11-13 08:22:34 +0100
commitefa0126ed6949e7897102dfaa713d76686246680 (patch)
tree910446e91cb2badff75bca280d73d7690fe44a01
parentee332d88dbf3acdc778b5c5a5d38685c00a27297 (diff)
Don't make changes when there are zone-wide problems
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java5
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java21
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java15
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java13
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java3
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java25
12 files changed, 77 insertions, 18 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 7703da4aab2..298404256b9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -292,6 +292,11 @@ public final class Node implements Nodelike {
return with(history.without(History.Event.Type.down));
}
+ /** Returns whether this node has a record of being down */
+ public boolean isDown() {
+ return history().event(History.Event.Type.down).isPresent();
+ }
+
/** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */
public Node allocate(ApplicationId owner, ClusterMembership membership, NodeResources requestedResources, Instant at) {
return this
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 17273344594..87a0684909c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -153,6 +153,9 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
return matching(node -> nodeStates.contains(node.state()));
}
+ /** Returns the subset of nodes which have a record of being down */
+ public NodeList down() { return matching(Node::isDown); }
+
/** Returns the subset of nodes which wantToRetire set true */
public NodeList wantToRetire() {
return matching(node -> node.status().wantToRetire());
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index 845d2763501..663d1d19995 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -385,6 +385,18 @@ public class NodeRepository extends AbstractComponent {
return List.of(getNodeAcl(node, candidates));
}
+ /**
+ * Returns whether the zone managed by this node repository seems to be working.
+ * If too many nodes are not responding, there is probably some zone-wide issue
+ * and we should probably refrain from making changes to it.
+ */
+ public boolean isWorking() {
+ NodeList activeNodes = list(State.active);
+ if (activeNodes.size() <= 5) return true; // Not enough data to decide
+ NodeList downNodes = activeNodes.down();
+ return ! ( (double)downNodes.size() / (double)activeNodes.size() > 0.2 );
+ }
+
// ----------------- Node lifecycle -----------------------------------------------------------
/** Adds a list of newly created docker container nodes to the node repository as <i>reserved</i> nodes */
@@ -768,7 +780,7 @@ public class NodeRepository extends AbstractComponent {
/**
* Increases the restart generation of the active nodes matching the filter.
*
- * @return the nodes in their new state.
+ * @return the nodes in their new state
*/
public List<Node> restart(NodeFilter filter) {
return performOn(StateFilter.from(State.active, filter),
@@ -778,7 +790,8 @@ public class NodeRepository extends AbstractComponent {
/**
* Increases the reboot generation of the nodes matching the filter.
- * @return the nodes in their new state.
+ *
+ * @return the nodes in their new state
*/
public List<Node> reboot(NodeFilter filter) {
return performOn(filter, (node, lock) -> write(node.withReboot(node.status().reboot().withIncreasedWanted()), lock));
@@ -787,7 +800,7 @@ public class NodeRepository extends AbstractComponent {
/**
* Set target OS version of all nodes matching given filter.
*
- * @return the nodes in their new state.
+ * @return the nodes in their new state
*/
public List<Node> upgradeOs(NodeFilter filter, Optional<Version> version) {
return performOn(filter, (node, lock) -> {
@@ -805,7 +818,7 @@ public class NodeRepository extends AbstractComponent {
* Writes this node after it has changed some internal state but NOT changed its state field.
* This does NOT lock the node repository implicitly, but callers are expected to already hold the lock.
*
- * @param lock Already acquired lock
+ * @param lock already acquired lock
* @return the written node for convenience
*/
public Node write(Node node, Mutex lock) { return write(List.of(node), lock).get(0); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index 3659ffd6b47..b93692ec47b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -46,6 +46,8 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
@Override
protected boolean maintain() {
+ if ( ! nodeRepository().isWorking()) return false;
+
boolean success = true;
if ( ! nodeRepository().zone().environment().isProduction()) return success;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index e5284a7b56d..4b6c24573eb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -62,16 +62,21 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
NodeList nodes = nodeRepository().list();
ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot();
- updateLockMetrics();
+ updateZoneMetrics();
+ updateCacheMetrics();
+ updateMaintenanceMetrics();
nodes.forEach(node -> updateNodeMetrics(node, serviceModel));
updateNodeCountMetrics(nodes);
- updateMaintenanceMetrics();
+ updateLockMetrics();
updateDockerMetrics(nodes);
updateTenantUsageMetrics(nodes);
- updateCacheMetrics();
return true;
}
+ private void updateZoneMetrics() {
+ metric.set("zone.working", nodeRepository().isWorking() ? 1 : 0, null);
+ }
+
private void updateCacheMetrics() {
CacheStats nodeCacheStats = nodeRepository().database().nodeSerializerCacheStats();
metric.set("cache.nodeObject.hitRate", nodeCacheStats.hitRate(), null);
@@ -183,8 +188,8 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
metric.set("someServicesDown", (numberOfServicesDown > 0 ? 1 : 0), context);
- boolean badNode = NodeFailureStatusUpdater.badNode(services);
- metric.set("nodeFailerBadNode", (badNode ? 1 : 0), context);
+ boolean down = NodeFailureStatusUpdater.allDown(services);
+ metric.set("nodeFailerBadNode", (down ? 1 : 0), context);
boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent();
metric.set("downInNodeRepo", (nodeDownInNodeRepo ? 1 : 0), context);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 9c7e6496ccd..6d571fada9e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -75,6 +75,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
@Override
protected boolean maintain() {
+ if ( ! nodeRepository().isWorking()) return false;
+
int throttledHostFailures = 0;
int throttledNodeFailures = 0;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
index e143f70c7c3..c10eb2e0568 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java
@@ -16,7 +16,6 @@ import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import com.yahoo.yolean.Exceptions;
-import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
@@ -73,25 +72,25 @@ public class NodeFailureStatusUpdater extends NodeRepositoryMaintainer {
}
/**
- * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
+ * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it.
* Otherwise we remove any "down" history record.
*/
private void updateActiveNodeDownState() {
- NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active));
+ NodeList activeNodes = nodeRepository().list(Node.State.active);
serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first();
if (node.isEmpty()) return;
// Already correct record, nothing to do
- boolean badNode = badNode(serviceInstances);
- if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return;
+ boolean isDown = allDown(serviceInstances);
+ if (isDown == node.get().isDown()) return;
// Lock and update status
ApplicationId owner = node.get().allocation().get().owner();
try (var lock = nodeRepository().lock(owner)) {
node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
if (node.isEmpty()) return; // Node disappeared or changed allocation
- if (badNode) {
+ if (isDown) {
recordAsDown(node.get(), lock);
} else {
clearDownRecord(node.get(), lock);
@@ -107,7 +106,7 @@ public class NodeFailureStatusUpdater extends NodeRepositoryMaintainer {
* Returns true if the node is considered bad: All monitored services services are down.
* If a node remains bad for a long time, the NodeFailer will try to fail the node.
*/
- static boolean badNode(List<ServiceInstance> services) {
+ static boolean allDown(List<ServiceInstance> services) {
Map<ServiceStatus, Long> countsByStatus = services.stream()
.collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index d6b2ca6d170..1651c494f4a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -34,6 +34,8 @@ public class Rebalancer extends NodeMover<Rebalancer.Move> {
@Override
protected boolean maintain() {
+ if ( ! nodeRepository().isWorking()) return false;
+
boolean success = true;
if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary
if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
index 475928863ee..de62ae8266b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -67,6 +67,8 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
@Override
protected boolean maintain() {
+ if ( ! nodeRepository().isWorking()) return false;
+
boolean success = true;
// Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand.
if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
index 94d1b6eb93e..b490cdf4c24 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
@@ -33,6 +33,8 @@ public class SwitchRebalancer extends NodeMover<Move> {
@Override
protected boolean maintain() {
+ if ( ! nodeRepository().isWorking()) return false;
+
boolean success = true;
// Using node list without holding lock as strong consistency is not needed here
NodeList allNodes = nodeRepository().list();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index 194ff070425..dbc0a98d879 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -87,6 +87,7 @@ public class MetricsReporterTest {
tester.makeProvisionedNodes(1, "default", NodeType.proxy, 0);
Map<String, Number> expectedMetrics = new TreeMap<>();
+ expectedMetrics.put("zone.working", 1);
expectedMetrics.put("hostedVespa.provisionedHosts", 1);
expectedMetrics.put("hostedVespa.parkedHosts", 0);
expectedMetrics.put("hostedVespa.readyHosts", 0);
@@ -121,7 +122,7 @@ public class MetricsReporterTest {
expectedMetrics.put("cache.nodeObject.size", 2L);
nodeRepository.list();
- expectedMetrics.put("cache.curator.hitRate", 0.5D);
+ expectedMetrics.put("cache.curator.hitRate", 0.52D);
expectedMetrics.put("cache.curator.evictionCount", 0L);
expectedMetrics.put("cache.curator.size", 12L);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index dcb70eed099..08e67e54946 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -180,11 +180,34 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(65));
tester.runMaintainers();
+ assertTrue(tester.nodeRepository.getNode(host_from_normal_app).get().isDown());
+ assertTrue(tester.nodeRepository.getNode(host_from_suspended_app).get().isDown());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(host_from_normal_app).get().state());
assertEquals(Node.State.active, tester.nodeRepository.getNode(host_from_suspended_app).get().state());
}
@Test
+ public void zone_is_not_working_if_too_many_nodes_down() {
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
+
+ tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(0).hostname());
+ tester.runMaintainers();
+ assertTrue(tester.nodeRepository.isWorking());
+
+ tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname());
+ tester.runMaintainers();
+ assertTrue(tester.nodeRepository.isWorking());
+
+ tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(2).hostname());
+ tester.runMaintainers();
+ assertFalse(tester.nodeRepository.isWorking());
+
+ tester.clock.advance(Duration.ofMinutes(65));
+ tester.runMaintainers();
+ assertTrue("Node failing is deactivated", tester.nodeRepository.list(Node.State.failed).isEmpty());
+ }
+
+ @Test
public void node_failing() {
NodeFailTester tester = NodeFailTester.withTwoApplications();
@@ -694,7 +717,7 @@ public class NodeFailerTest {
addServiceInstances(services, ServiceStatus.NOT_CHECKED, numNotChecked);
Collections.shuffle(services);
- return NodeFailureStatusUpdater.badNode(services);
+ return NodeFailureStatusUpdater.allDown(services);
}
/**