diff options
author | Håkon Hallingstad <hakon@oath.com> | 2017-10-23 08:56:10 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-23 08:56:10 +0200 |
commit | d6ae51f8a026ff407926825cfb88b749e6e968ef (patch) | |
tree | 56836ee2e8b1098e786ba9dfa37160ac2fa98743 /node-repository/src/main/java | |
parent | 535c1ae687415540b2d5e727957665f02f898de7 (diff) | |
parent | cc17135ac684896538e14633b262639667caa463 (diff) |
Merge pull request #3817 from vespa-engine/hakonhall/add-node-metrics-in-node-repo
Add node metrics in node repo
Diffstat (limited to 'node-repository/src/main/java')
2 files changed, 135 insertions, 13 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 402dd4ffdf3..3ac5b496f56 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -1,17 +1,26 @@
 // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.provision.maintenance;
 
+import com.yahoo.component.Version;
+import com.yahoo.config.provision.ApplicationId;
 import com.yahoo.config.provision.Flavor;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.jdisc.Metric;
+import com.yahoo.vespa.applicationmodel.HostName;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Allocation;
 import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity;
+import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
+import com.yahoo.vespa.orchestrator.Orchestrator;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
 
 import java.time.Duration;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.stream.Collectors;
 
 /**
@@ -20,22 +29,138 @@ import java.util.stream.Collectors;
 public class MetricsReporter extends Maintainer {
 
     private final Metric metric;
-    private final HashMap<Flavor, Metric.Context> contextMap = new HashMap<>();
+    private final Orchestrator orchestrator;
+    private final Map<Map<String, String>, Metric.Context> contextMap = new HashMap<>();
 
-    public MetricsReporter(NodeRepository nodeRepository, Metric metric, Duration interval, JobControl jobControl) {
+    public MetricsReporter(NodeRepository nodeRepository,
+                           Metric metric,
+                           Orchestrator orchestrator,
+                           Duration interval,
+                           JobControl jobControl) {
         super(nodeRepository, interval, jobControl);
         this.metric = metric;
+        this.orchestrator = orchestrator;
     }
 
     @Override
     public void maintain() {
+        List<Node> nodes = nodeRepository().getNodes();
+        nodes.forEach(this::updateNodeMetrics);
+        updateStateMetrics(nodes);
+        updateDockerMetrics(nodes);
+    }
+
+    private void updateNodeMetrics(Node node) {
+        // Dimensions automatically added: host, vespaVersion, zone, role, and colo.
+        // 'vespaVersion' is the vespaVersion for the config server and not related
+        // to the node we're making metric for now.
+
+        Metric.Context context;
+
+        Optional<Allocation> allocation = node.allocation();
+        if (allocation.isPresent()) {
+            ApplicationId applicationId = allocation.get().owner();
+            context = getContextAt(
+                    "state", node.state().name(),
+                    "hostname", node.hostname(),
+                    "tenantName", applicationId.tenant().value(),
+                    "applicationId", applicationId.serializedForm().replace(':', '.'),
+                    "clustertype", allocation.get().membership().cluster().type().name(),
+                    "clusterid", allocation.get().membership().cluster().id().value());
+
+            long wantedRestartGeneration = allocation.get().restartGeneration().wanted();
+            metric.set("wantedRestartGeneration", wantedRestartGeneration, context);
+            long currentRestartGeneration = allocation.get().restartGeneration().current();
+            metric.set("currentRestartGeneration", currentRestartGeneration, context);
+            boolean wantToRestart = currentRestartGeneration < wantedRestartGeneration;
+            metric.set("wantToRestart", wantToRestart ? 1 : 0, context);
+
+            Version wantedVersion = allocation.get().membership().cluster().vespaVersion();
+            double wantedVersionNumber = getVersionAsNumber(wantedVersion);
+            metric.set("wantedVespaVersion", wantedVersionNumber, context);
+
+            Optional<Version> currentVersion = node.status().vespaVersion();
+            boolean converged = currentVersion.isPresent() &&
+                    currentVersion.get().equals(wantedVersion);
+            metric.set("wantToChangeVespaVersion", converged ? 0 : 1, context);
+        } else {
+            context = getContextAt(
+                    "state", node.state().name(),
+                    "hostname", node.hostname());
+        }
+
+        Optional<Version> currentVersion = node.status().vespaVersion();
+        // Node repo checks for !isEmpty(), so let's do that here too.
+        if (currentVersion.isPresent() && !currentVersion.get().isEmpty()) {
+            double currentVersionNumber = getVersionAsNumber(currentVersion.get());
+            metric.set("currentVespaVersion", currentVersionNumber, context);
+        }
+
+        long wantedRebootGeneration = node.status().reboot().wanted();
+        metric.set("wantedRebootGeneration", wantedRebootGeneration, context);
+        long currentRebootGeneration = node.status().reboot().current();
+        metric.set("currentRebootGeneration", currentRebootGeneration, context);
+        boolean wantToReboot = currentRebootGeneration < wantedRebootGeneration;
+        metric.set("wantToReboot", wantToReboot ? 1 : 0, context);
+
+        metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context);
+        metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context);
+
+        try {
+            HostStatus status = orchestrator.getNodeStatus(new HostName(node.hostname()));
+            boolean allowedToBeDown = status == HostStatus.ALLOWED_TO_BE_DOWN;
+            metric.set("allowedToBeDown", allowedToBeDown ? 1 : 0, context);
+        } catch (HostNameNotFoundException e) {
+            // Deliberately ignored: no allowedToBeDown metric is set when the orchestrator lookup fails for this host.
+        }
+
+        // TODO: Also add metric on whether some services are down on node?
+    }
+
+    /**
+     * A version 6.163.20 will be returned as a number 163.020. The major
+     * version can normally be inferred. As long as the micro version stays
+     * below 1000 these numbers sort like Version.
+     */
+    private static double getVersionAsNumber(Version version) {
+        return version.getMinor() + version.getMicro() / 1000.0;
+    }
+
+    private Metric.Context getContextAt(String... point) {
+        if (point.length % 2 != 0) {
+            throw new IllegalArgumentException("Dimension specification comes in pairs");
+        }
+
+        Map<String, String> dimensions = new HashMap<>();
+        for (int i = 0; i < point.length; i += 2) {
+            dimensions.put(point[i], point[i + 1]);
+        }
+
+        Metric.Context context = contextMap.get(dimensions);
+        if (context != null) {
+            return context;
+        }
+
+        context = metric.createContext(dimensions);
+        contextMap.put(dimensions, context);
+        return context;
+    }
+
+    private void updateStateMetrics(List<Node> nodes) {
+        Map<Node.State, List<Node>> nodesByState = nodes.stream()
+                .collect(Collectors.groupingBy(Node::state));
+
         // Metrics pr state
-        for (Node.State state : Node.State.values())
-            metric.set("hostedVespa." + state.name() + "Hosts",
-                       nodeRepository().getNodes(NodeType.tenant, state).size(), null);
+        for (Node.State state : Node.State.values()) {
+            List<Node> nodesInState = nodesByState.getOrDefault(state, new ArrayList<>());
+            long size = nodesInState.stream().filter(node -> node.type() == NodeType.tenant).count();
+            metric.set("hostedVespa." + state.name() + "Hosts", size, null);
+        }
+    }
 
+    private void updateDockerMetrics(List<Node> nodes) {
         // Capacity flavors for docker
-        DockerHostCapacity capacity = new DockerHostCapacity(nodeRepository().getNodes(Node.State.values()));
+        DockerHostCapacity capacity = new DockerHostCapacity(nodes);
         metric.set("hostedVespa.docker.totalCapacityCpu", capacity.getCapacityTotal().getCpu(), null);
         metric.set("hostedVespa.docker.totalCapacityMem", capacity.getCapacityTotal().getMemory(), null);
         metric.set("hostedVespa.docker.totalCapacityDisk", capacity.getCapacityTotal().getDisk(), null);
@@ -47,15 +172,12 @@ public class MetricsReporter extends Maintainer {
                 .filter(f -> f.getType().equals(Flavor.Type.DOCKER_CONTAINER))
                 .collect(Collectors.toList());
         for (Flavor flavor : dockerFlavors) {
-            if (!contextMap.containsKey(flavor)) {
-                Map<String, String> dimensions = new HashMap<>();
-                dimensions.put("flavor", flavor.name());
-                contextMap.put(flavor, metric.createContext(dimensions));
-            }
-            Metric.Context context = contextMap.get(flavor);
+            Metric.Context context = getContextAt("flavor", flavor.name());
             metric.set("hostedVespa.docker.freeCapacityFlavor", capacity.freeCapacityInFlavorEquivalence(flavor), context);
             metric.set("hostedVespa.docker.idealHeadroomFlavor", flavor.getIdealHeadroom(), context);
             metric.set("hostedVespa.docker.hostsAvailableFlavor", capacity.getNofHostsAvailableFor(flavor), context);
         }
+
+
     }
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 2057fd7e36f..7305b91f317 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -76,7 +76,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
         dirtyExpirer = new DirtyExpirer(nodeRepository, clock, durationFromEnv("dirty_expiry").orElse(defaults.dirtyExpiry), jobControl);
         provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, durationFromEnv("provisioned_expiry").orElse(defaults.provisionedExpiry), jobControl);
         nodeRebooter = new NodeRebooter(nodeRepository, clock, durationFromEnv("reboot_interval").orElse(defaults.rebootInterval), jobControl);
-        metricsReporter = new MetricsReporter(nodeRepository, metric, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval), jobControl);
+        metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval), jobControl);
 
         RetirementPolicy policy = new RetirementPolicyList(new RetireIPv4OnlyNodes(zone));
         FlavorSpareChecker flavorSpareChecker = new FlavorSpareChecker(