diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-11-13 08:22:34 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-11-13 08:22:34 +0100 |
commit | efa0126ed6949e7897102dfaa713d76686246680 (patch) | |
tree | 910446e91cb2badff75bca280d73d7690fe44a01 | |
parent | ee332d88dbf3acdc778b5c5a5d38685c00a27297 (diff) |
Don't make changes when there are zone-wide problems
12 files changed, 77 insertions, 18 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 7703da4aab2..298404256b9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -292,6 +292,11 @@ public final class Node implements Nodelike { return with(history.without(History.Event.Type.down)); } + /** Returns whether this node has a record of being down */ + public boolean isDown() { + return history().event(History.Event.Type.down).isPresent(); + } + /** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */ public Node allocate(ApplicationId owner, ClusterMembership membership, NodeResources requestedResources, Instant at) { return this diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index 17273344594..87a0684909c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -153,6 +153,9 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { return matching(node -> nodeStates.contains(node.state())); } + /** Returns the subset of nodes which have a record of being down */ + public NodeList down() { return matching(Node::isDown); } + /** Returns the subset of nodes which wantToRetire set true */ public NodeList wantToRetire() { return matching(node -> node.status().wantToRetire()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 845d2763501..663d1d19995 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -385,6 +385,18 @@ public class NodeRepository extends AbstractComponent { return List.of(getNodeAcl(node, candidates)); } + /** + * Returns whether the zone managed by this node repository seems to be working. + * If too many nodes are not responding, there is probably some zone-wide issue + * and we should probably refrain from making changes to it. + */ + public boolean isWorking() { + NodeList activeNodes = list(State.active); + if (activeNodes.size() <= 5) return true; // Not enough data to decide + NodeList downNodes = activeNodes.down(); + return ! ( (double)downNodes.size() / (double)activeNodes.size() > 0.2 ); + } + // ----------------- Node lifecycle ----------------------------------------------------------- /** Adds a list of newly created docker container nodes to the node repository as <i>reserved</i> nodes */ @@ -768,7 +780,7 @@ public class NodeRepository extends AbstractComponent { /** * Increases the restart generation of the active nodes matching the filter. * - * @return the nodes in their new state. + * @return the nodes in their new state */ public List<Node> restart(NodeFilter filter) { return performOn(StateFilter.from(State.active, filter), @@ -778,7 +790,8 @@ public class NodeRepository extends AbstractComponent { /** * Increases the reboot generation of the nodes matching the filter. - * @return the nodes in their new state. + * + * @return the nodes in their new state */ public List<Node> reboot(NodeFilter filter) { return performOn(filter, (node, lock) -> write(node.withReboot(node.status().reboot().withIncreasedWanted()), lock)); @@ -787,7 +800,7 @@ public class NodeRepository extends AbstractComponent { /** * Set target OS version of all nodes matching given filter. * - * @return the nodes in their new state. + * @return the nodes in their new state */ public List<Node> upgradeOs(NodeFilter filter, Optional<Version> version) { return performOn(filter, (node, lock) -> { @@ -805,7 +818,7 @@ public class NodeRepository extends AbstractComponent { * Writes this node after it has changed some internal state but NOT changed its state field. * This does NOT lock the node repository implicitly, but callers are expected to already hold the lock. * - * @param lock Already acquired lock + * @param lock already acquired lock * @return the written node for convenience */ public Node write(Node node, Mutex lock) { return write(List.of(node), lock).get(0); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 3659ffd6b47..b93692ec47b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -46,6 +46,8 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { + if ( ! nodeRepository().isWorking()) return false; + boolean success = true; if ( ! nodeRepository().zone().environment().isProduction()) return success; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index e5284a7b56d..4b6c24573eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -62,16 +62,21 @@ public class MetricsReporter extends NodeRepositoryMaintainer { NodeList nodes = nodeRepository().list(); ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot(); - updateLockMetrics(); + updateZoneMetrics(); + updateCacheMetrics(); + updateMaintenanceMetrics(); nodes.forEach(node -> updateNodeMetrics(node, serviceModel)); updateNodeCountMetrics(nodes); - updateMaintenanceMetrics(); + updateLockMetrics(); updateDockerMetrics(nodes); updateTenantUsageMetrics(nodes); - updateCacheMetrics(); return true; } + private void updateZoneMetrics() { + metric.set("zone.working", nodeRepository().isWorking() ? 1 : 0, null); + } + private void updateCacheMetrics() { CacheStats nodeCacheStats = nodeRepository().database().nodeSerializerCacheStats(); metric.set("cache.nodeObject.hitRate", nodeCacheStats.hitRate(), null); @@ -183,8 +188,8 @@ public class MetricsReporter extends NodeRepositoryMaintainer { metric.set("someServicesDown", (numberOfServicesDown > 0 ? 1 : 0), context); - boolean badNode = NodeFailureStatusUpdater.badNode(services); - metric.set("nodeFailerBadNode", (badNode ? 1 : 0), context); + boolean down = NodeFailureStatusUpdater.allDown(services); + metric.set("nodeFailerBadNode", (down ? 1 : 0), context); boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent(); metric.set("downInNodeRepo", (nodeDownInNodeRepo ? 1 : 0), context); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 9c7e6496ccd..6d571fada9e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -75,6 +75,8 @@ public class NodeFailer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { + if ( ! nodeRepository().isWorking()) return false; + int throttledHostFailures = 0; int throttledNodeFailures = 0; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java index e143f70c7c3..c10eb2e0568 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailureStatusUpdater.java @@ -16,7 +16,6 @@ import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.service.monitor.ServiceMonitor; import com.yahoo.yolean.Exceptions; -import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.List; @@ -73,25 +72,25 @@ public class NodeFailureStatusUpdater extends NodeRepositoryMaintainer { } /** - * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it. + * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it. * Otherwise we remove any "down" history record. */ private void updateActiveNodeDownState() { - NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active)); + NodeList activeNodes = nodeRepository().list(Node.State.active); serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first(); if (node.isEmpty()) return; // Already correct record, nothing to do - boolean badNode = badNode(serviceInstances); - if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return; + boolean isDown = allDown(serviceInstances); + if (isDown == node.get().isDown()) return; // Lock and update status ApplicationId owner = node.get().allocation().get().owner(); try (var lock = nodeRepository().lock(owner)) { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation - if (badNode) { + if (isDown) { recordAsDown(node.get(), lock); } else { clearDownRecord(node.get(), lock); @@ -107,7 +106,7 @@ public class NodeFailureStatusUpdater extends NodeRepositoryMaintainer { * Returns true if the node is considered bad: All monitored services services are down. * If a node remains bad for a long time, the NodeFailer will try to fail the node. */ - static boolean badNode(List<ServiceInstance> services) { + static boolean allDown(List<ServiceInstance> services) { Map<ServiceStatus, Long> countsByStatus = services.stream() .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index d6b2ca6d170..1651c494f4a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -34,6 +34,8 @@ public class Rebalancer extends NodeMover<Rebalancer.Move> { @Override protected boolean maintain() { + if ( ! nodeRepository().isWorking()) return false; + boolean success = true; if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 475928863ee..de62ae8266b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -67,6 +67,8 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { + if ( ! nodeRepository().isWorking()) return false; + boolean success = true; // Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand. if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index 94d1b6eb93e..b490cdf4c24 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -33,6 +33,8 @@ public class SwitchRebalancer extends NodeMover<Move> { @Override protected boolean maintain() { + if ( ! nodeRepository().isWorking()) return false; + boolean success = true; // Using node list without holding lock as strong consistency is not needed here NodeList allNodes = nodeRepository().list(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index 194ff070425..dbc0a98d879 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -87,6 +87,7 @@ public class MetricsReporterTest { tester.makeProvisionedNodes(1, "default", NodeType.proxy, 0); Map<String, Number> expectedMetrics = new TreeMap<>(); + expectedMetrics.put("zone.working", 1); expectedMetrics.put("hostedVespa.provisionedHosts", 1); expectedMetrics.put("hostedVespa.parkedHosts", 0); expectedMetrics.put("hostedVespa.readyHosts", 0); @@ -121,7 +122,7 @@ public class MetricsReporterTest { expectedMetrics.put("cache.nodeObject.size", 2L); nodeRepository.list(); - expectedMetrics.put("cache.curator.hitRate", 0.5D); + expectedMetrics.put("cache.curator.hitRate", 0.52D); expectedMetrics.put("cache.curator.evictionCount", 0L); expectedMetrics.put("cache.curator.size", 12L); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index dcb70eed099..08e67e54946 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -180,11 +180,34 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(65)); tester.runMaintainers(); + assertTrue(tester.nodeRepository.getNode(host_from_normal_app).get().isDown()); + assertTrue(tester.nodeRepository.getNode(host_from_suspended_app).get().isDown()); assertEquals(Node.State.failed, tester.nodeRepository.getNode(host_from_normal_app).get().state()); assertEquals(Node.State.active, tester.nodeRepository.getNode(host_from_suspended_app).get().state()); } @Test + public void zone_is_not_working_if_too_many_nodes_down() { + NodeFailTester tester = NodeFailTester.withTwoApplications(); + + tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(0).hostname()); + tester.runMaintainers(); + assertTrue(tester.nodeRepository.isWorking()); + + tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname()); + tester.runMaintainers(); + assertTrue(tester.nodeRepository.isWorking()); + + tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(2).hostname()); + tester.runMaintainers(); + assertFalse(tester.nodeRepository.isWorking()); + + tester.clock.advance(Duration.ofMinutes(65)); + tester.runMaintainers(); + assertTrue("Node failing is deactivated", tester.nodeRepository.list(Node.State.failed).isEmpty()); + } + + @Test public void node_failing() { NodeFailTester tester = NodeFailTester.withTwoApplications(); @@ -694,7 +717,7 @@ public class NodeFailerTest { addServiceInstances(services, ServiceStatus.NOT_CHECKED, numNotChecked); Collections.shuffle(services); - return NodeFailureStatusUpdater.badNode(services); + return NodeFailureStatusUpdater.allDown(services); } /** |