From aad3751786ae125f5f4bdc71b10ff77498e9454c Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Thu, 19 Oct 2017 11:22:53 +0200 Subject: Add node metrics in node repo --- .../provision/maintenance/MetricsReporter.java | 146 +++++++++++++++++++-- .../maintenance/NodeRepositoryMaintenance.java | 2 +- .../provision/monitoring/MetricsReporterTest.java | 46 +++++-- 3 files changed, 167 insertions(+), 27 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 402dd4ffdf3..09af884004b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -1,17 +1,26 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.component.Version; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity; +import com.yahoo.vespa.orchestrator.HostNameNotFoundException; +import com.yahoo.vespa.orchestrator.Orchestrator; +import com.yahoo.vespa.orchestrator.status.HostStatus; import java.time.Duration; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; /** @@ -20,22 +29,138 @@ import java.util.stream.Collectors; public class MetricsReporter extends Maintainer { private final Metric metric; - private final HashMap contextMap = new HashMap<>(); + private final Orchestrator orchestrator; + private final Map, Metric.Context> contextMap = new HashMap<>(); - public MetricsReporter(NodeRepository nodeRepository, Metric metric, Duration interval, JobControl jobControl) { + public MetricsReporter(NodeRepository nodeRepository, + Metric metric, + Orchestrator orchestrator, + Duration interval, + JobControl jobControl) { super(nodeRepository, interval, jobControl); this.metric = metric; + this.orchestrator = orchestrator; } @Override public void maintain() { + List nodes = nodeRepository().getNodes(); + nodes.forEach(this::updateNodeMetrics); + updateStateMetrics(nodes); + updateDockerMetrics(nodes); + } + + private void updateNodeMetrics(Node node) { + // Dimensions automatically added: host, vespaVersion, zone, role, and colo. 
+ // 'vespaVersion' is the Vespa version of the config server, and is not related + // to the node the metrics below are reported for. + + Metric.Context context; + + Optional allocation = node.allocation(); + if (allocation.isPresent()) { + ApplicationId applicationId = allocation.get().owner(); + context = getContextAt( + "state", node.state().name(), + "hostname", node.hostname(), + "tenantName", applicationId.tenant().value(), + "applicationId", applicationId.serializedForm().replace(':', '.'), + "clustertype", allocation.get().membership().cluster().type().name(), + "clusterid", allocation.get().membership().cluster().id().value()); + + long wantedRestartGeneration = allocation.get().restartGeneration().wanted(); + metric.set("wantedRestartGeneration", wantedRestartGeneration, context); + long currentRestartGeneration = allocation.get().restartGeneration().current(); + metric.set("currentRestartGeneration", currentRestartGeneration, context); + boolean wantToRestart = currentRestartGeneration < wantedRestartGeneration; + metric.set("wantToRestart", wantToRestart ? 1 : 0, context); + + Version wantedVersion = allocation.get().membership().cluster().vespaVersion(); + double wantedVersionNumber = getVersionAsNumber(wantedVersion); + metric.set("wantedVersion", wantedVersionNumber, context); + + Optional currentVersion = node.status().vespaVersion(); + boolean converged = currentVersion.isPresent() && + currentVersion.get().equals(wantedVersion); + metric.set("wantToChangeVersion", converged ? 0 : 1, context); + } else { + context = getContextAt( + "state", node.state().name(), + "hostname", node.hostname()); + } + + Optional currentVersion = node.status().vespaVersion(); + // Node repo checks for !isEmpty(), so let's do that here too. 
+ if (currentVersion.isPresent() && !currentVersion.get().isEmpty()) { + double currentVersionNumber = getVersionAsNumber(currentVersion.get()); + metric.set("currentVersion", currentVersionNumber, context); + } + + long wantedRebootGeneration = node.status().reboot().wanted(); + metric.set("wantedRebootGeneration", wantedRebootGeneration, context); + long currentRebootGeneration = node.status().reboot().current(); + metric.set("currentRebootGeneration", currentRebootGeneration, context); + boolean wantToReboot = currentRebootGeneration < wantedRebootGeneration; + metric.set("wantToReboot", wantToReboot ? 1 : 0, context); + + metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context); + metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context); + + try { + HostStatus status = orchestrator.getNodeStatus(new HostName(node.hostname())); + boolean allowedToBeDown = status == HostStatus.ALLOWED_TO_BE_DOWN; + metric.set("allowedToBeDown", allowedToBeDown ? 1 : 0, context); + } catch (HostNameNotFoundException e) { + // Ignore + } + + // TODO: Also add metric on whether some services are down on node? + } + + /** + * A version 6.163.20 will be returned as a number 163.020. The major + * version can normally be inferred. As long as the micro version stays + * below 1000 these numbers sort like Version. + */ + private static double getVersionAsNumber(Version version) { + return version.getMinor() + version.getMicro() / 1000.0; + } + + private Metric.Context getContextAt(String... 
point) { + if (point.length % 2 != 0) { + throw new IllegalArgumentException("Dimension specification comes in pairs"); + } + + Map dimensions = new HashMap<>(); + for (int i = 0; i < point.length; i += 2) { + dimensions.put(point[i], point[i + 1]); + } + + Metric.Context context = contextMap.get(dimensions); + if (context != null) { + return context; + } + + context = metric.createContext(dimensions); + contextMap.put(dimensions, context); + return context; + } + + private void updateStateMetrics(List nodes) { + Map> nodesByState = nodes.stream() + .collect(Collectors.groupingBy(Node::state)); + // Metrics pr state - for (Node.State state : Node.State.values()) - metric.set("hostedVespa." + state.name() + "Hosts", - nodeRepository().getNodes(NodeType.tenant, state).size(), null); + for (Node.State state : Node.State.values()) { + List nodesInState = nodesByState.getOrDefault(state, new ArrayList<>()); + long size = nodesInState.stream().filter(node -> node.type() == NodeType.tenant).count(); + metric.set("hostedVespa." 
+ state.name() + "Hosts", size, null); + } + } + private void updateDockerMetrics(List nodes) { // Capacity flavors for docker - DockerHostCapacity capacity = new DockerHostCapacity(nodeRepository().getNodes(Node.State.values())); + DockerHostCapacity capacity = new DockerHostCapacity(nodes); metric.set("hostedVespa.docker.totalCapacityCpu", capacity.getCapacityTotal().getCpu(), null); metric.set("hostedVespa.docker.totalCapacityMem", capacity.getCapacityTotal().getMemory(), null); metric.set("hostedVespa.docker.totalCapacityDisk", capacity.getCapacityTotal().getDisk(), null); @@ -47,15 +172,12 @@ public class MetricsReporter extends Maintainer { .filter(f -> f.getType().equals(Flavor.Type.DOCKER_CONTAINER)) .collect(Collectors.toList()); for (Flavor flavor : dockerFlavors) { - if (!contextMap.containsKey(flavor)) { - Map dimensions = new HashMap<>(); - dimensions.put("flavor", flavor.name()); - contextMap.put(flavor, metric.createContext(dimensions)); - } - Metric.Context context = contextMap.get(flavor); + Metric.Context context = getContextAt("flavor", flavor.name()); metric.set("hostedVespa.docker.freeCapacityFlavor", capacity.freeCapacityInFlavorEquivalence(flavor), context); metric.set("hostedVespa.docker.idealHeadroomFlavor", flavor.getIdealHeadroom(), context); metric.set("hostedVespa.docker.hostsAvailableFlavor", capacity.getNofHostsAvailableFor(flavor), context); } + + } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 2057fd7e36f..7305b91f317 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -76,7 +76,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { dirtyExpirer = new 
DirtyExpirer(nodeRepository, clock, durationFromEnv("dirty_expiry").orElse(defaults.dirtyExpiry), jobControl); provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, durationFromEnv("provisioned_expiry").orElse(defaults.provisionedExpiry), jobControl); nodeRebooter = new NodeRebooter(nodeRepository, clock, durationFromEnv("reboot_interval").orElse(defaults.rebootInterval), jobControl); - metricsReporter = new MetricsReporter(nodeRepository, metric, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval), jobControl); + metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval), jobControl); RetirementPolicy policy = new RetirementPolicyList(new RetireIPv4OnlyNodes(zone)); FlavorSpareChecker flavorSpareChecker = new FlavorSpareChecker( diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java index a5228143e53..b40e1f4923c 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java @@ -19,6 +19,8 @@ import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.Generation; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; +import com.yahoo.vespa.orchestrator.Orchestrator; +import com.yahoo.vespa.orchestrator.status.HostStatus; import org.junit.Test; import java.time.Clock; @@ -33,6 +35,9 @@ import java.util.Optional; import java.util.Set; import static org.junit.Assert.assertEquals; +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; /** * @author 
oyving @@ -41,7 +46,7 @@ import static org.junit.Assert.assertEquals; public class MetricsReporterTest { @Test - public void test_registered_metric() throws InterruptedException { + public void test_registered_metric() throws Exception { NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default"); Curator curator = new MockCurator(); NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, Clock.systemUTC(), Zone.defaultZone(), @@ -53,14 +58,14 @@ public class MetricsReporterTest { nodeRepository.addNodes(Collections.singletonList(hostNode)); Map expectedMetrics = new HashMap<>(); - expectedMetrics.put("hostedVespa.provisionedHosts", 1); - expectedMetrics.put("hostedVespa.parkedHosts", 0); - expectedMetrics.put("hostedVespa.readyHosts", 0); - expectedMetrics.put("hostedVespa.reservedHosts", 0); - expectedMetrics.put("hostedVespa.activeHosts", 0); - expectedMetrics.put("hostedVespa.inactiveHosts", 0); - expectedMetrics.put("hostedVespa.dirtyHosts", 0); - expectedMetrics.put("hostedVespa.failedHosts", 0); + expectedMetrics.put("hostedVespa.provisionedHosts", 1L); + expectedMetrics.put("hostedVespa.parkedHosts", 0L); + expectedMetrics.put("hostedVespa.readyHosts", 0L); + expectedMetrics.put("hostedVespa.reservedHosts", 0L); + expectedMetrics.put("hostedVespa.activeHosts", 0L); + expectedMetrics.put("hostedVespa.inactiveHosts", 0L); + expectedMetrics.put("hostedVespa.dirtyHosts", 0L); + expectedMetrics.put("hostedVespa.failedHosts", 0L); expectedMetrics.put("hostedVespa.docker.totalCapacityDisk", 0.0); expectedMetrics.put("hostedVespa.docker.totalCapacityMem", 0.0); expectedMetrics.put("hostedVespa.docker.totalCapacityCpu", 0.0); @@ -68,15 +73,25 @@ public class MetricsReporterTest { expectedMetrics.put("hostedVespa.docker.freeCapacityMem", 0.0); expectedMetrics.put("hostedVespa.docker.freeCapacityCpu", 0.0); + expectedMetrics.put("wantedRebootGeneration", 0L); + expectedMetrics.put("currentRebootGeneration", 0L); + 
expectedMetrics.put("wantToReboot", 0); + expectedMetrics.put("wantToRetire", 0); + expectedMetrics.put("wantToDeprovision", 0); + expectedMetrics.put("allowedToBeDown", 0); + + Orchestrator orchestrator = mock(Orchestrator.class); + when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS); + TestMetric metric = new TestMetric(); - MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, Duration.ofMinutes(1), new JobControl(nodeRepository.database())); + MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, Duration.ofMinutes(1), new JobControl(nodeRepository.database())); metricsReporter.maintain(); assertEquals(expectedMetrics, metric.values); } @Test - public void docker_metrics() throws InterruptedException { + public void docker_metrics() throws Exception { NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("host", "docker", "docker2"); Curator curator = new MockCurator(); NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, Clock.systemUTC(), Zone.defaultZone(), @@ -103,12 +118,15 @@ public class MetricsReporterTest { container2 = container2.with(allocation(Optional.of("app2")).get()); nodeRepository.addDockerNodes(Collections.singletonList(container2)); + Orchestrator orchestrator = mock(Orchestrator.class); + when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS); + TestMetric metric = new TestMetric(); - MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, Duration.ofMinutes(1), new JobControl(nodeRepository.database())); + MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, Duration.ofMinutes(1), new JobControl(nodeRepository.database())); metricsReporter.maintain(); - assertEquals(0, metric.values.get("hostedVespa.readyHosts")); /** Only tenants counts **/ - assertEquals(2, metric.values.get("hostedVespa.reservedHosts")); + assertEquals(0L, 
metric.values.get("hostedVespa.readyHosts")); /** Only tenants counts **/ + assertEquals(2L, metric.values.get("hostedVespa.reservedHosts")); assertEquals(12.0, metric.values.get("hostedVespa.docker.totalCapacityDisk")); assertEquals(10.0, metric.values.get("hostedVespa.docker.totalCapacityMem")); -- cgit v1.2.3 From cc17135ac684896538e14633b262639667caa463 Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Fri, 20 Oct 2017 13:27:09 +0200 Subject: Use VespaVersion instead of Version --- .../yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 09af884004b..3ac5b496f56 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -77,12 +77,12 @@ public class MetricsReporter extends Maintainer { Version wantedVersion = allocation.get().membership().cluster().vespaVersion(); double wantedVersionNumber = getVersionAsNumber(wantedVersion); - metric.set("wantedVersion", wantedVersionNumber, context); + metric.set("wantedVespaVersion", wantedVersionNumber, context); Optional currentVersion = node.status().vespaVersion(); boolean converged = currentVersion.isPresent() && currentVersion.get().equals(wantedVersion); - metric.set("wantToChangeVersion", converged ? 0 : 1, context); + metric.set("wantToChangeVespaVersion", converged ? 0 : 1, context); } else { context = getContextAt( "state", node.state().name(), @@ -93,7 +93,7 @@ public class MetricsReporter extends Maintainer { // Node repo checks for !isEmpty(), so let's do that here too. 
if (currentVersion.isPresent() && !currentVersion.get().isEmpty()) { double currentVersionNumber = getVersionAsNumber(currentVersion.get()); - metric.set("currentVersion", currentVersionNumber, context); + metric.set("currentVespaVersion", currentVersionNumber, context); } long wantedRebootGeneration = node.status().reboot().wanted(); -- cgit v1.2.3