diff options
author | Jon Bratseth <bratseth@gmail.com> | 2023-05-20 19:19:27 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-20 19:19:27 +0200 |
commit | 932b3cecfcc71408c9fe25bd23df992737e516a1 (patch) | |
tree | 986270b1e8c4850b2f33729a7b8d941aef5bf31d | |
parent | 6d515d0fd6405b2ec322d59d949db47920824b8a (diff) | |
parent | e49196cede3117a3622eccc983463912fbed63f7 (diff) |
Merge pull request #27152 from vespa-engine/bratseth/cluster-load-metrics
Add cluster load metrics MERGEOK
6 files changed, 56 insertions, 14 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java index 362bc7b0964..a60d4d45317 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java @@ -131,6 +131,11 @@ public class VespaMetricSet { addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS.last()); addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last()); + return metrics; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index b791c843357..2728249333e 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -57,7 +57,7 @@ public class SimpleTokenizer implements Tokenizer { } /** Tokenize the input, and apply the given transform to each token string. */ - public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) { + public Iterable<Token> tokenize(String input, Function<String, String> tokenProcessor) { if (input.isEmpty()) return List.of(); List<Token> tokens = new ArrayList<>(); @@ -71,7 +71,7 @@ public class SimpleTokenizer implements Tokenizer { String original = input.substring(prev, next); tokens.add(new SimpleToken(original).setOffset(prev) .setType(tokenType) - .setTokenString(tokenProocessor.apply(original))); + .setTokenString(tokenProcessor.apply(original))); prev = next; prevType = nextType; tokenType = prevType; diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java index 013c50e77cf..d323026e4ca 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java @@ -27,7 +27,6 @@ public enum ConfigServerMetrics implements VespaMetrics { MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE("maintenanceDeployment.transientFailure", Unit.OPERATION, "Number of maintenance deployments that failed with a transient failure"), MAINTENANCE_DEPLOYMENT_FAILURE("maintenanceDeployment.failure", Unit.OPERATION, "Number of maintenance deployments that failed with a permanent failure"), - // ZooKeeper related metrics ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"), ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"), @@ -45,8 +44,13 @@ public enum ConfigServerMetrics implements VespaMetrics { ORCHESTRATOR_LOCK_ACQUIRE_TIMEOUT("orchestrator.lock.acquire-timedout", Unit.OPERATION, "Number of times zookeeper lock couldn't be acquired within timeout"), ORCHESTRATOR_LOCK_ACQUIRE("orchestrator.lock.acquire", Unit.OPERATION, "Number of attempts to acquire zookeeper lock"), ORCHESTRATOR_LOCK_ACQUIRED("orchestrator.lock.acquired", Unit.OPERATION, "Number of times zookeeper lock was acquired"), - ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released"); + ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released"), + // Node repository metrics + CLUSTER_COST("cluster.cost", Unit.DOLLAR_PER_HOUR, "The cost of the nodes allocated to a certain cluster, in $/hr"), + CLUSTER_LOAD_IDEAL_CPU("cluster.load.ideal.cpu", Unit.FRACTION, "The ideal cpu load of a certain cluster"), + CLUSTER_LOAD_IDEAL_MEMORY("cluster.load.ideal.memory", Unit.FRACTION, "The ideal memory load of a certain cluster"), + CLUSTER_LOAD_IDEAL_DISK("cluster.load.ideal.disk", Unit.FRACTION, "The ideal disk load of a certain cluster"); private final String name; private final Unit unit; diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java index a2123d72246..ee6ea569fc4 100644 --- a/metrics/src/main/java/ai/vespa/metrics/Unit.java +++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java @@ -12,6 +12,7 @@ public enum Unit { CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"), DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"), DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"), + DOLLAR_PER_HOUR(BaseUnit.DOLLAR, BaseUnit.HOUR, "Total current cost of the cluster in $/hr"), FAILURE(BaseUnit.FAILURE, "Failures, typically for requests, operations or nodes"), FILE(BaseUnit.FILE, "Data file stored on the disk on a node"), FRACTION(BaseUnit.FRACTION, "A value in the range [0..1]. Higher values can occur for some metrics, but would indicate the value is outside of the allowed range."), @@ -80,10 +81,12 @@ public enum Unit { CONNECTION("connection"), DOCUMENT("document"), DOCUMENTID("documentid"), + DOLLAR("dollar"), FAILURE("failure"), FILE("file"), FRACTION("fraction"), HIT("hit"), + HOUR("hour"), INSTANCE("instance"), ITEM("item"), MILLISECOND("millisecond", "ms"), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 4f94f0fab53..f01f5a30870 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import ai.vespa.metrics.ConfigServerMetrics; import com.yahoo.collections.Pair; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; @@ -18,6 +19,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.Node.State; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.ClusterId; import com.yahoo.vespa.hosted.provision.persistence.CacheStats; @@ -118,7 +120,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { byCluster.forEach((clusterId, clusterNodes) -> { Metric.Context context = getContext(dimensions(clusterId.application(), clusterId.cluster())); updateExclusiveSwitchMetrics(clusterNodes, nodes, context); - updateClusterCostMetrics(clusterNodes, context); + updateClusterCostMetrics(clusterId, clusterNodes, context); }); } @@ -129,9 +131,16 @@ public class MetricsReporter extends NodeRepositoryMaintainer { metric.set("nodes.exclusiveSwitchFraction", exclusiveSwitchRatio,context); } - private void updateClusterCostMetrics(List<Node> clusterNodes, Metric.Context context) { + private void updateClusterCostMetrics(ClusterId clusterId, + List<Node> clusterNodes, Metric.Context context) { + var cluster = nodeRepository().applications().get(clusterId.application()) + .flatMap(application -> application.cluster(clusterId.cluster())); + if (cluster.isEmpty()) return; double cost = clusterNodes.stream().mapToDouble(node -> node.resources().cost()).sum(); - metric.set("cluster.cost", cost, context); + metric.set(ConfigServerMetrics.CLUSTER_COST.baseName(), cost, context); + metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.baseName(), cluster.get().target().ideal().cpu(), context); + metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.baseName(), cluster.get().target().ideal().memory(), context); + metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.baseName(), cluster.get().target().ideal().disk(), context); } private void updateZoneMetrics() { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index 487355a0b75..de2c060a0eb 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -19,8 +19,11 @@ import com.yahoo.vespa.curator.stats.LockStats; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; +import com.yahoo.vespa.hosted.provision.autoscale.Load; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Allocation; +import com.yahoo.vespa.hosted.provision.node.ClusterId; import com.yahoo.vespa.hosted.provision.node.Generation; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; @@ -240,7 +243,7 @@ public class MetricsReporterTest { } @Test - public void non_active_metric() { + public void node_and_cluster_metrics() { ProvisioningTester tester = new ProvisioningTester.Builder().build(); tester.makeReadyHosts(5, new NodeResources(64, 256, 2000, 10)); tester.activateTenantHosts(); @@ -248,18 +251,36 @@ public class MetricsReporterTest { MetricsReporter metricsReporter = metricsReporter(metric, tester); // Application is deployed - ApplicationId application = ApplicationId.from("t1", "a1", "default"); - Map<String, String> dimensions = Map.of("applicationId", application.toFullString()); + ApplicationId applicationId = ApplicationId.from("t1", "a1", "default"); + ClusterSpec clusterSpec = ProvisioningTester.contentClusterSpec(); NodeResources resources = new NodeResources(2, 8, 100, 1); - List<Node> activeNodes = tester.deploy(application, ProvisioningTester.contentClusterSpec(), Capacity.from(new ClusterResources(4, 1, resources))); + Capacity capacity = Capacity.from(new ClusterResources(4, 1, resources)); + + List<Node> activeNodes = tester.deploy(applicationId, clusterSpec, capacity); + var application = tester.nodeRepository().applications().require(applicationId); + application = application.withCluster(clusterSpec.id(), false, capacity); + var cluster = application.cluster(clusterSpec.id()).get().withTarget(new Autoscaling(Autoscaling.Status.ideal, + "test", + Optional.empty(), + tester.clock().instant(), + Load.zero(), + new Load(0.1, 0.2, 0.3), + Autoscaling.Metrics.zero())); + tester.nodeRepository().applications().put(application.with(cluster), tester.nodeRepository().applications().lock(applicationId)); + metricsReporter.maintain(); + Map<String, String> dimensions = Map.of("applicationId", applicationId.toFullString()); assertEquals(0D, getMetric("nodes.nonActiveFraction", metric, dimensions)); assertEquals(4, getMetric("nodes.active", metric, dimensions)); assertEquals(0, getMetric("nodes.nonActive", metric, dimensions)); - Map<String, String> clusterDimensions = Map.of("applicationId", application.toFullString(), - "clusterid", ProvisioningTester.contentClusterSpec().id().value()); + + Map<String, String> clusterDimensions = Map.of("applicationId", applicationId.toFullString(), + "clusterid", clusterSpec.id().value()); assertEquals(1.392, getMetric("cluster.cost", metric, clusterDimensions)); + assertEquals(0.1, getMetric("cluster.load.ideal.cpu", metric, clusterDimensions)); + assertEquals(0.2, getMetric("cluster.load.ideal.memory", metric, clusterDimensions)); + assertEquals(0.3, getMetric("cluster.load.ideal.disk", metric, clusterDimensions)); // One node fails tester.fail(activeNodes.get(0).hostname()); @@ -269,7 +290,7 @@ public class MetricsReporterTest { assertEquals(1, getMetric("nodes.nonActive", metric, dimensions)); // Cluster is removed - tester.deactivate(application); + tester.deactivate(applicationId); metricsReporter.maintain(); assertEquals(1D, getMetric("nodes.nonActiveFraction", metric, dimensions).doubleValue(), Double.MIN_VALUE); assertEquals(0, getMetric("nodes.active", metric, dimensions)); |