summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@oath.com>2018-01-10 14:52:13 +0100
committerHåkon Hallingstad <hakon@oath.com>2018-01-10 14:52:13 +0100
commitcfe0bb3a0dc6f1d02a55746bb0e128b73796210d (patch)
tree24356198728935edf2f2649eebbc62fdddcb6a67 /node-repository
parent3e1477f5fda4a3dcd436a6d41843adc66e19f370 (diff)
Fail node only if all monitored services are down
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java50
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java8
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java46
4 files changed, 76 insertions, 35 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 8448597fdee..aeb30320bda 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -12,6 +12,7 @@ import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Allocation;
+import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity;
import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
import com.yahoo.vespa.orchestrator.Orchestrator;
@@ -152,6 +153,12 @@ public class MetricsReporter extends Maintainer {
metric.set("numberOfServicesDown", numberOfServicesDown, context);
metric.set("someServicesDown", (numberOfServicesDown > 0 ? 1 : 0), context);
+
+ boolean badNode = NodeFailer.badNode(services);
+ metric.set("nodeFailerBadNode", (badNode ? 1 : 0), context);
+
+ boolean nodeDownInNodeRepo = node.history().event(History.Event.Type.down).isPresent();
+ metric.set("downInNodeRepo", (nodeDownInNodeRepo ? 1 : 0), context);
}
metric.set("numberOfServices", numberOfServices, context);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 266d91e7e3e..dbff2ece060 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -6,8 +6,6 @@ import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.transaction.Mutex;
-import com.yahoo.vespa.applicationmodel.ApplicationInstance;
-import com.yahoo.vespa.applicationmodel.ServiceCluster;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
@@ -29,6 +27,9 @@ import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.counting;
/**
* Maintains information in the node repo about when this node last responded to ping
@@ -169,27 +170,38 @@ public class NodeFailer extends Maintainer {
}
/**
- * If the node is positively DOWN, and there is no "down" history record, we add it.
- * If the node is positively UP we remove any "down" history record.
+ * If a node remains bad for a long time, the NodeFailer will eventually try to fail the node.
+ */
+ public static boolean badNode(List<ServiceInstance> services) {
+ Map<ServiceStatus, Long> countsByStatus = services.stream()
+ .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));
+
+ return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
+ countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
+ }
+
+ /**
+ * If the node is down (see badNode()), and there is no "down" history record, we add it.
+ * Otherwise we remove any "down" history record.
*
- * @return a list of all nodes which are positively currently in the down state
+     * @return a list of all active nodes that should be considered down
*/
private List<Node> determineActiveNodeDownStatus() {
List<Node> downNodes = new ArrayList<>();
- for (ApplicationInstance application : serviceMonitor.getAllApplicationInstances().values()) {
- for (ServiceCluster cluster : application.serviceClusters()) {
- for (ServiceInstance service : cluster.serviceInstances()) {
- Optional<Node> node = nodeRepository().getNode(service.hostName().s(), Node.State.active);
- if ( ! node.isPresent()) continue; // we also get status from infrastructure nodes, which are not in the repo. TODO: remove when proxy nodes are in node repo everywhere
-
- if (service.serviceStatus().equals(ServiceStatus.DOWN))
- downNodes.add(recordAsDown(node.get()));
- else if (service.serviceStatus().equals(ServiceStatus.UP))
- clearDownRecord(node.get());
- // else: we don't know current status; don't take any action until we have positive information
- }
- }
- }
+ serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
+ .entrySet().stream().forEach(
+ entry -> {
+ Optional<Node> node = nodeRepository().getNode(entry.getKey().s(), Node.State.active);
+ if (node.isPresent()) {
+ if (badNode(entry.getValue())) {
+ downNodes.add(recordAsDown(node.get()));
+ } else {
+ clearDownRecord(node.get());
+ }
+ }
+ }
+ );
+
return downNodes;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java
index 0bb054ae3ae..8215dc8ecd0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java
@@ -34,7 +34,6 @@ public class ServiceMonitorStub implements ServiceMonitor {
private final NodeRepository nodeRepository;
private Set<String> downHosts = new HashSet<>();
- private boolean statusIsKnown = true;
/** Create a service monitor where all nodes are initially up */
@Inject
@@ -57,12 +56,7 @@ public class ServiceMonitorStub implements ServiceMonitor {
downHosts.remove(hostname);
}
- public void setStatusIsKnown(boolean statusIsKnown) {
- this.statusIsKnown = statusIsKnown;
- }
-
private ServiceStatus getHostStatus(String hostname) {
- if (!statusIsKnown) return ServiceStatus.NOT_CHECKED;
if (downHosts.contains(hostname)) return ServiceStatus.DOWN;
return ServiceStatus.UP;
}
@@ -92,6 +86,6 @@ public class ServiceMonitorStub implements ServiceMonitor {
@Override
public ServiceModel getServiceModelSnapshot() {
- throw new UnsupportedOperationException("getServicemodelSnapshot has not been implemented");
+ return new ServiceModel(getAllApplicationInstances());
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index b9b871dfd1f..f95cfc1b0f1 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -3,6 +3,8 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.vespa.applicationmodel.ServiceInstance;
+import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
@@ -10,7 +12,9 @@ import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException;
import org.junit.Test;
import java.time.Duration;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -22,6 +26,8 @@ import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
/**
* Tests automatic failing of nodes.
@@ -107,20 +113,13 @@ public class NodeFailerTest {
tester.failer.run();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
- // the system goes down and do not have updated information when coming back
+ // the system goes down
tester.clock.advance(Duration.ofMinutes(120));
tester.failer = tester.createFailer();
- tester.serviceMonitor.setStatusIsKnown(false);
tester.failer.run();
- // due to this, nothing is failed
- assertEquals( 1, tester.deployer.redeployments);
- assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
- assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- // when status becomes known, and the host is still down, it is failed
+ // the host is still down and fails
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
- tester.serviceMonitor.setStatusIsKnown(true);
tester.failer.run();
assertEquals( 2, tester.deployer.redeployments);
assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -457,6 +456,35 @@ public class NodeFailerTest {
}
}
+ @Test
+ public void testUpness() {
+ assertFalse(badNode(0, 0, 0));
+ assertFalse(badNode(0, 0, 2));
+ assertFalse(badNode(0, 3, 0));
+ assertFalse(badNode(0, 3, 2));
+ assertTrue(badNode(1, 0, 0));
+ assertTrue(badNode(1, 0, 2));
+ assertFalse(badNode(1, 3, 0));
+ assertFalse(badNode(1, 3, 2));
+ }
+
+ private void addServiceInstances(List<ServiceInstance> list, ServiceStatus status, int num) {
+ for (int i = 0; i < num; ++i) {
+ ServiceInstance service = mock(ServiceInstance.class);
+ when(service.serviceStatus()).thenReturn(status);
+ list.add(service);
+ }
+ }
+
+ private boolean badNode(int numDown, int numUp, int numNotChecked) {
+ List<ServiceInstance> services = new ArrayList<>();
+ addServiceInstances(services, ServiceStatus.DOWN, numDown);
+ addServiceInstances(services, ServiceStatus.UP, numUp);
+ addServiceInstances(services, ServiceStatus.NOT_CHECKED, numNotChecked);
+ Collections.shuffle(services);
+
+ return NodeFailer.badNode(services);
+ }
/**
* Selects the first parent host that: