From cfe0bb3a0dc6f1d02a55746bb0e128b73796210d Mon Sep 17 00:00:00 2001 From: HÃ¥kon Hallingstad Date: Wed, 10 Jan 2018 14:52:13 +0100 Subject: Fail node only if all monitored services are down --- .../provision/maintenance/MetricsReporter.java | 7 +++ .../hosted/provision/maintenance/NodeFailer.java | 50 ++++++++++++++-------- .../provision/testutils/ServiceMonitorStub.java | 8 +--- .../provision/maintenance/NodeFailerTest.java | 46 ++++++++++++++++---- 4 files changed, 76 insertions(+), 35 deletions(-) (limited to 'node-repository/src') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 8448597fdee..aeb30320bda 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -12,6 +12,7 @@ import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Allocation; +import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity; import com.yahoo.vespa.orchestrator.HostNameNotFoundException; import com.yahoo.vespa.orchestrator.Orchestrator; @@ -152,6 +153,12 @@ public class MetricsReporter extends Maintainer { metric.set("numberOfServicesDown", numberOfServicesDown, context); metric.set("someServicesDown", (numberOfServicesDown > 0 ? 1 : 0), context); + + boolean badNode = NodeFailer.badNode(services); + metric.set("nodeFailerBadNode", (badNode ? 1 : 0), context); + + boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent(); + metric.set("downInNodeRepo", (nodeDownInNodeRepo ? 1 : 0), context); } metric.set("numberOfServices", numberOfServices, context); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 266d91e7e3e..dbff2ece060 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -6,8 +6,6 @@ import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.NodeType; import com.yahoo.transaction.Mutex; -import com.yahoo.vespa.applicationmodel.ApplicationInstance; -import com.yahoo.vespa.applicationmodel.ServiceCluster; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; @@ -29,6 +27,9 @@ import java.util.Map; import java.util.Optional; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.Collectors; + +import static java.util.stream.Collectors.counting; /** * Maintains information in the node repo about when this node last responded to ping @@ -169,27 +170,38 @@ public class NodeFailer extends Maintainer { } /** - * If the node is positively DOWN, and there is no "down" history record, we add it. - * If the node is positively UP we remove any "down" history record. + * If a node remains bad for a long time, the NodeFailer will eventually try to fail the node. + */ + public static boolean badNode(List services) { + Map countsByStatus = services.stream() + .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting())); + + return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L && + countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L; + } + + /** + * If the node is down (see badNode()), and there is no "down" history record, we add it. + * Otherwise we remove any "down" history record. * - * @return a list of all nodes which are positively currently in the down state + * @return a list of all nodes that should be considered as down */ private List determineActiveNodeDownStatus() { List downNodes = new ArrayList<>(); - for (ApplicationInstance application : serviceMonitor.getAllApplicationInstances().values()) { - for (ServiceCluster cluster : application.serviceClusters()) { - for (ServiceInstance service : cluster.serviceInstances()) { - Optional node = nodeRepository().getNode(service.hostName().s(), Node.State.active); - if ( ! node.isPresent()) continue; // we also get status from infrastructure nodes, which are not in the repo. TODO: remove when proxy nodes are in node repo everywhere - - if (service.serviceStatus().equals(ServiceStatus.DOWN)) - downNodes.add(recordAsDown(node.get())); - else if (service.serviceStatus().equals(ServiceStatus.UP)) - clearDownRecord(node.get()); - // else: we don't know current status; don't take any action until we have positive information - } - } - } + serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName() + .entrySet().stream().forEach( + entry -> { + Optional node = nodeRepository().getNode(entry.getKey().s(), Node.State.active); + if (node.isPresent()) { + if (badNode(entry.getValue())) { + downNodes.add(recordAsDown(node.get())); + } else { + clearDownRecord(node.get()); + } + } + } + ); + return downNodes; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java index 0bb054ae3ae..8215dc8ecd0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java @@ -34,7 +34,6 @@ public class ServiceMonitorStub implements ServiceMonitor { private final NodeRepository nodeRepository; private Set downHosts = new HashSet<>(); - private boolean statusIsKnown = true; /** Create a service monitor where all nodes are initially up */ @Inject @@ -57,12 +56,7 @@ public class ServiceMonitorStub implements ServiceMonitor { downHosts.remove(hostname); } - public void setStatusIsKnown(boolean statusIsKnown) { - this.statusIsKnown = statusIsKnown; - } - private ServiceStatus getHostStatus(String hostname) { - if (!statusIsKnown) return ServiceStatus.NOT_CHECKED; if (downHosts.contains(hostname)) return ServiceStatus.DOWN; return ServiceStatus.UP; } @@ -92,6 +86,6 @@ public class ServiceMonitorStub implements ServiceMonitor { @Override public ServiceModel getServiceModelSnapshot() { - throw new UnsupportedOperationException("getServicemodelSnapshot has not been implemented"); + return new ServiceModel(getAllApplicationInstances()); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index b9b871dfd1f..f95cfc1b0f1 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -3,6 +3,8 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; +import com.yahoo.vespa.applicationmodel.ServiceInstance; +import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; @@ -10,7 +12,9 @@ import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException; import org.junit.Test; import java.time.Duration; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -22,6 +26,8 @@ import java.util.stream.Stream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; /** * Tests automatic failing of nodes. @@ -107,20 +113,13 @@ public class NodeFailerTest { tester.failer.run(); tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); - // the system goes down and do not have updated information when coming back + // the system goes down tester.clock.advance(Duration.ofMinutes(120)); tester.failer = tester.createFailer(); - tester.serviceMonitor.setStatusIsKnown(false); tester.failer.run(); - // due to this, nothing is failed - assertEquals( 1, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - // when status becomes known, and the host is still down, it is failed + // the host is still down and fails tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); - tester.serviceMonitor.setStatusIsKnown(true); tester.failer.run(); assertEquals( 2, tester.deployer.redeployments); assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); @@ -457,6 +456,35 @@ public class NodeFailerTest { } } + @Test + public void testUpness() { + assertFalse(badNode(0, 0, 0)); + assertFalse(badNode(0, 0, 2)); + assertFalse(badNode(0, 3, 0)); + assertFalse(badNode(0, 3, 2)); + assertTrue(badNode(1, 0, 0)); + assertTrue(badNode(1, 0, 2)); + assertFalse(badNode(1, 3, 0)); + assertFalse(badNode(1, 3, 2)); + } + + private void addServiceInstances(List list, ServiceStatus status, int num) { + for (int i = 0; i < num; ++i) { + ServiceInstance service = mock(ServiceInstance.class); + when(service.serviceStatus()).thenReturn(status); + list.add(service); + } + } + + private boolean badNode(int numDown, int numUp, int numNotChecked) { + List services = new ArrayList<>(); + addServiceInstances(services, ServiceStatus.DOWN, numDown); + addServiceInstances(services, ServiceStatus.UP, numUp); + addServiceInstances(services, ServiceStatus.NOT_CHECKED, numNotChecked); + Collections.shuffle(services); + + return NodeFailer.badNode(services); + } /** * Selects the first parent host that: -- cgit v1.2.3