summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@oath.com>2017-10-23 08:56:10 +0200
committerGitHub <noreply@github.com>2017-10-23 08:56:10 +0200
commitd6ae51f8a026ff407926825cfb88b749e6e968ef (patch)
tree56836ee2e8b1098e786ba9dfa37160ac2fa98743
parent535c1ae687415540b2d5e727957665f02f898de7 (diff)
parentcc17135ac684896538e14633b262639667caa463 (diff)
Merge pull request #3817 from vespa-engine/hakonhall/add-node-metrics-in-node-repo
Add node metrics in node repo
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java146
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java46
3 files changed, 167 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 402dd4ffdf3..3ac5b496f56 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -1,17 +1,26 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.component.Version;
+import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
+import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity;
+import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
+import com.yahoo.vespa.orchestrator.Orchestrator;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
import java.time.Duration;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.stream.Collectors;
/**
@@ -20,22 +29,138 @@ import java.util.stream.Collectors;
public class MetricsReporter extends Maintainer {
private final Metric metric;
- private final HashMap<Flavor, Metric.Context> contextMap = new HashMap<>();
+ private final Orchestrator orchestrator;
+ private final Map<Map<String, String>, Metric.Context> contextMap = new HashMap<>();
- public MetricsReporter(NodeRepository nodeRepository, Metric metric, Duration interval, JobControl jobControl) {
+ public MetricsReporter(NodeRepository nodeRepository,
+ Metric metric,
+ Orchestrator orchestrator,
+ Duration interval,
+ JobControl jobControl) {
super(nodeRepository, interval, jobControl);
this.metric = metric;
+ this.orchestrator = orchestrator;
}
@Override
public void maintain() {
+ List<Node> nodes = nodeRepository().getNodes();
+ nodes.forEach(this::updateNodeMetrics);
+ updateStateMetrics(nodes);
+ updateDockerMetrics(nodes);
+ }
+
+ /**
+  * Reports the metrics of a single node.
+  *
+  * Every node gets its reboot generations, retire/deprovision flags and, when known,
+  * its current Vespa version and orchestrator status. Allocated nodes additionally get
+  * restart generations and wanted-version metrics, and are reported in a context that
+  * also carries the owning application and cluster as dimensions.
+  *
+  * @param node the node to report metrics for
+  */
+ private void updateNodeMetrics(Node node) {
+     // Dimensions automatically added: host, vespaVersion, zone, role, and colo.
+     // 'vespaVersion' is the vespaVersion for the config server and not related
+     // to the node we're making metric for now.
+
+     Metric.Context context;
+
+     // Looked up once; used both for the convergence check and the currentVespaVersion metric.
+     Optional<Version> currentVersion = node.status().vespaVersion();
+
+     Optional<Allocation> allocation = node.allocation();
+     if (allocation.isPresent()) {
+         Allocation nodeAllocation = allocation.get();
+         ApplicationId applicationId = nodeAllocation.owner();
+         context = getContextAt(
+                 "state", node.state().name(),
+                 "hostname", node.hostname(),
+                 "tenantName", applicationId.tenant().value(),
+                 "applicationId", applicationId.serializedForm().replace(':', '.'),
+                 "clustertype", nodeAllocation.membership().cluster().type().name(),
+                 "clusterid", nodeAllocation.membership().cluster().id().value());
+
+         long wantedRestartGeneration = nodeAllocation.restartGeneration().wanted();
+         metric.set("wantedRestartGeneration", wantedRestartGeneration, context);
+         // Must read current(), not wanted(): otherwise the two generations are always
+         // equal and wantToRestart can never be 1.
+         long currentRestartGeneration = nodeAllocation.restartGeneration().current();
+         metric.set("currentRestartGeneration", currentRestartGeneration, context);
+         boolean wantToRestart = currentRestartGeneration < wantedRestartGeneration;
+         metric.set("wantToRestart", wantToRestart ? 1 : 0, context);
+
+         Version wantedVersion = nodeAllocation.membership().cluster().vespaVersion();
+         double wantedVersionNumber = getVersionAsNumber(wantedVersion);
+         metric.set("wantedVespaVersion", wantedVersionNumber, context);
+
+         boolean converged = currentVersion.isPresent() &&
+                 currentVersion.get().equals(wantedVersion);
+         metric.set("wantToChangeVespaVersion", converged ? 0 : 1, context);
+     } else {
+         context = getContextAt(
+                 "state", node.state().name(),
+                 "hostname", node.hostname());
+     }
+
+     // Node repo checks for !isEmpty(), so let's do that here too.
+     if (currentVersion.isPresent() && !currentVersion.get().isEmpty()) {
+         double currentVersionNumber = getVersionAsNumber(currentVersion.get());
+         metric.set("currentVespaVersion", currentVersionNumber, context);
+     }
+
+     long wantedRebootGeneration = node.status().reboot().wanted();
+     metric.set("wantedRebootGeneration", wantedRebootGeneration, context);
+     long currentRebootGeneration = node.status().reboot().current();
+     metric.set("currentRebootGeneration", currentRebootGeneration, context);
+     boolean wantToReboot = currentRebootGeneration < wantedRebootGeneration;
+     metric.set("wantToReboot", wantToReboot ? 1 : 0, context);
+
+     metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context);
+     metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context);
+
+     try {
+         HostStatus status = orchestrator.getNodeStatus(new HostName(node.hostname()));
+         boolean allowedToBeDown = status == HostStatus.ALLOWED_TO_BE_DOWN;
+         metric.set("allowedToBeDown", allowedToBeDown ? 1 : 0, context);
+     } catch (HostNameNotFoundException ignored) {
+         // Deliberately ignored: when the orchestrator cannot find the host name,
+         // the allowedToBeDown metric is simply not reported for this node.
+     }
+
+     // TODO: Also add metric on whether some services are down on node?
+ }
+
+ /**
+  * Encodes the minor and micro parts of a version as a single number, computed as
+  * minor + micro / 1000. For example, version 6.163.20 becomes 163.020.
+  *
+  * The major part is not included in the number. As long as the micro part stays
+  * below 1000, numbers from versions with the same major sort like Version.
+  *
+  * @param version the version to encode
+  * @return the minor version plus one thousandth of the micro version
+  */
+ private static double getVersionAsNumber(Version version) {
+     double microFraction = version.getMicro() / 1000.0;
+     return version.getMinor() + microFraction;
+ }
+
+ /**
+  * Returns the metric context for the given dimensions, creating and caching it in
+  * contextMap the first time that set of dimensions is requested.
+  *
+  * @param point alternating dimension names and values: name1, value1, name2, value2, ...
+  * @return the context for exactly these dimensions
+  * @throws IllegalArgumentException if an odd number of strings is given
+  */
+ private Metric.Context getContextAt(String... point) {
+     if ((point.length & 1) == 1) {
+         throw new IllegalArgumentException("Dimension specification comes in pairs");
+     }
+
+     // Fold the (name, value) pairs into the map used both as cache key and as dimensions.
+     Map<String, String> dimensions = new HashMap<>();
+     int index = 0;
+     while (index < point.length) {
+         dimensions.put(point[index], point[index + 1]);
+         index += 2;
+     }
+
+     Metric.Context cached = contextMap.get(dimensions);
+     if (cached == null) {
+         cached = metric.createContext(dimensions);
+         contextMap.put(dimensions, cached);
+     }
+     return cached;
+ }
+
+ private void updateStateMetrics(List<Node> nodes) {
+ Map<Node.State, List<Node>> nodesByState = nodes.stream()
+ .collect(Collectors.groupingBy(Node::state));
+
// Metrics pr state
- for (Node.State state : Node.State.values())
- metric.set("hostedVespa." + state.name() + "Hosts",
- nodeRepository().getNodes(NodeType.tenant, state).size(), null);
+ for (Node.State state : Node.State.values()) {
+ List<Node> nodesInState = nodesByState.getOrDefault(state, new ArrayList<>());
+ long size = nodesInState.stream().filter(node -> node.type() == NodeType.tenant).count();
+ metric.set("hostedVespa." + state.name() + "Hosts", size, null);
+ }
+ }
+ private void updateDockerMetrics(List<Node> nodes) {
// Capacity flavors for docker
- DockerHostCapacity capacity = new DockerHostCapacity(nodeRepository().getNodes(Node.State.values()));
+ DockerHostCapacity capacity = new DockerHostCapacity(nodes);
metric.set("hostedVespa.docker.totalCapacityCpu", capacity.getCapacityTotal().getCpu(), null);
metric.set("hostedVespa.docker.totalCapacityMem", capacity.getCapacityTotal().getMemory(), null);
metric.set("hostedVespa.docker.totalCapacityDisk", capacity.getCapacityTotal().getDisk(), null);
@@ -47,15 +172,12 @@ public class MetricsReporter extends Maintainer {
.filter(f -> f.getType().equals(Flavor.Type.DOCKER_CONTAINER))
.collect(Collectors.toList());
for (Flavor flavor : dockerFlavors) {
- if (!contextMap.containsKey(flavor)) {
- Map<String, String> dimensions = new HashMap<>();
- dimensions.put("flavor", flavor.name());
- contextMap.put(flavor, metric.createContext(dimensions));
- }
- Metric.Context context = contextMap.get(flavor);
+ Metric.Context context = getContextAt("flavor", flavor.name());
metric.set("hostedVespa.docker.freeCapacityFlavor", capacity.freeCapacityInFlavorEquivalence(flavor), context);
metric.set("hostedVespa.docker.idealHeadroomFlavor", flavor.getIdealHeadroom(), context);
metric.set("hostedVespa.docker.hostsAvailableFlavor", capacity.getNofHostsAvailableFor(flavor), context);
}
+
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 2057fd7e36f..7305b91f317 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -76,7 +76,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
dirtyExpirer = new DirtyExpirer(nodeRepository, clock, durationFromEnv("dirty_expiry").orElse(defaults.dirtyExpiry), jobControl);
provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, durationFromEnv("provisioned_expiry").orElse(defaults.provisionedExpiry), jobControl);
nodeRebooter = new NodeRebooter(nodeRepository, clock, durationFromEnv("reboot_interval").orElse(defaults.rebootInterval), jobControl);
- metricsReporter = new MetricsReporter(nodeRepository, metric, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval), jobControl);
+ metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval), jobControl);
RetirementPolicy policy = new RetirementPolicyList(new RetireIPv4OnlyNodes(zone));
FlavorSpareChecker flavorSpareChecker = new FlavorSpareChecker(
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java
index a5228143e53..b40e1f4923c 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java
@@ -19,6 +19,8 @@ import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.Generation;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
+import com.yahoo.vespa.orchestrator.Orchestrator;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
import org.junit.Test;
import java.time.Clock;
@@ -33,6 +35,9 @@ import java.util.Optional;
import java.util.Set;
import static org.junit.Assert.assertEquals;
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
/**
* @author oyving
@@ -41,7 +46,7 @@ import static org.junit.Assert.assertEquals;
public class MetricsReporterTest {
@Test
- public void test_registered_metric() throws InterruptedException {
+ public void test_registered_metric() throws Exception {
NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default");
Curator curator = new MockCurator();
NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, Clock.systemUTC(), Zone.defaultZone(),
@@ -53,14 +58,14 @@ public class MetricsReporterTest {
nodeRepository.addNodes(Collections.singletonList(hostNode));
Map<String, Number> expectedMetrics = new HashMap<>();
- expectedMetrics.put("hostedVespa.provisionedHosts", 1);
- expectedMetrics.put("hostedVespa.parkedHosts", 0);
- expectedMetrics.put("hostedVespa.readyHosts", 0);
- expectedMetrics.put("hostedVespa.reservedHosts", 0);
- expectedMetrics.put("hostedVespa.activeHosts", 0);
- expectedMetrics.put("hostedVespa.inactiveHosts", 0);
- expectedMetrics.put("hostedVespa.dirtyHosts", 0);
- expectedMetrics.put("hostedVespa.failedHosts", 0);
+ expectedMetrics.put("hostedVespa.provisionedHosts", 1L);
+ expectedMetrics.put("hostedVespa.parkedHosts", 0L);
+ expectedMetrics.put("hostedVespa.readyHosts", 0L);
+ expectedMetrics.put("hostedVespa.reservedHosts", 0L);
+ expectedMetrics.put("hostedVespa.activeHosts", 0L);
+ expectedMetrics.put("hostedVespa.inactiveHosts", 0L);
+ expectedMetrics.put("hostedVespa.dirtyHosts", 0L);
+ expectedMetrics.put("hostedVespa.failedHosts", 0L);
expectedMetrics.put("hostedVespa.docker.totalCapacityDisk", 0.0);
expectedMetrics.put("hostedVespa.docker.totalCapacityMem", 0.0);
expectedMetrics.put("hostedVespa.docker.totalCapacityCpu", 0.0);
@@ -68,15 +73,25 @@ public class MetricsReporterTest {
expectedMetrics.put("hostedVespa.docker.freeCapacityMem", 0.0);
expectedMetrics.put("hostedVespa.docker.freeCapacityCpu", 0.0);
+ expectedMetrics.put("wantedRebootGeneration", 0L);
+ expectedMetrics.put("currentRebootGeneration", 0L);
+ expectedMetrics.put("wantToReboot", 0);
+ expectedMetrics.put("wantToRetire", 0);
+ expectedMetrics.put("wantToDeprovision", 0);
+ expectedMetrics.put("allowedToBeDown", 0);
+
+ Orchestrator orchestrator = mock(Orchestrator.class);
+ when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS);
+
TestMetric metric = new TestMetric();
- MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, Duration.ofMinutes(1), new JobControl(nodeRepository.database()));
+ MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, Duration.ofMinutes(1), new JobControl(nodeRepository.database()));
metricsReporter.maintain();
assertEquals(expectedMetrics, metric.values);
}
@Test
- public void docker_metrics() throws InterruptedException {
+ public void docker_metrics() throws Exception {
NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("host", "docker", "docker2");
Curator curator = new MockCurator();
NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, Clock.systemUTC(), Zone.defaultZone(),
@@ -103,12 +118,15 @@ public class MetricsReporterTest {
container2 = container2.with(allocation(Optional.of("app2")).get());
nodeRepository.addDockerNodes(Collections.singletonList(container2));
+ Orchestrator orchestrator = mock(Orchestrator.class);
+ when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS);
+
TestMetric metric = new TestMetric();
- MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, Duration.ofMinutes(1), new JobControl(nodeRepository.database()));
+ MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, Duration.ofMinutes(1), new JobControl(nodeRepository.database()));
metricsReporter.maintain();
- assertEquals(0, metric.values.get("hostedVespa.readyHosts")); /** Only tenants counts **/
- assertEquals(2, metric.values.get("hostedVespa.reservedHosts"));
+ assertEquals(0L, metric.values.get("hostedVespa.readyHosts")); /** Only tenants counts **/
+ assertEquals(2L, metric.values.get("hostedVespa.reservedHosts"));
assertEquals(12.0, metric.values.get("hostedVespa.docker.totalCapacityDisk"));
assertEquals(10.0, metric.values.get("hostedVespa.docker.totalCapacityMem"));