diff options
5 files changed, 53 insertions, 12 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java index 362bc7b0964..a60d4d45317 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java @@ -131,6 +131,11 @@ public class VespaMetricSet { addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS.last()); addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last()); + return metrics; } diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java index 013c50e77cf..d323026e4ca 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java @@ -27,7 +27,6 @@ public enum ConfigServerMetrics implements VespaMetrics { MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE("maintenanceDeployment.transientFailure", Unit.OPERATION, "Number of maintenance deployments that failed with a transient failure"), MAINTENANCE_DEPLOYMENT_FAILURE("maintenanceDeployment.failure", Unit.OPERATION, "Number of maintenance deployments that failed with a permanent failure"), - // ZooKeeper related metrics ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"), ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"), @@ -45,8 +44,13 @@ public enum ConfigServerMetrics implements VespaMetrics { ORCHESTRATOR_LOCK_ACQUIRE_TIMEOUT("orchestrator.lock.acquire-timedout", Unit.OPERATION, "Number of times zookeeper lock couldn't be acquired within timeout"), ORCHESTRATOR_LOCK_ACQUIRE("orchestrator.lock.acquire", Unit.OPERATION, "Number of attempts to acquire zookeeper lock"), ORCHESTRATOR_LOCK_ACQUIRED("orchestrator.lock.acquired", Unit.OPERATION, "Number of times zookeeper lock was acquired"), - ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released"); + ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released"), + // Node repository metrics + CLUSTER_COST("cluster.cost", Unit.DOLLAR_PER_HOUR, "The cost of the nodes allocated to a certain cluster, in $/hr"), + CLUSTER_LOAD_IDEAL_CPU("cluster.load.ideal.cpu", Unit.FRACTION, "The ideal cpu load of a certain cluster"), + CLUSTER_LOAD_IDEAL_MEMORY("cluster.load.ideal.memory", Unit.FRACTION, "The ideal memory load of a certain cluster"), + CLUSTER_LOAD_IDEAL_DISK("cluster.load.ideal.disk", Unit.FRACTION, "The ideal disk load of a certain cluster"); private final String name; private final Unit unit; diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java index a2123d72246..d5769707b76 100644 --- a/metrics/src/main/java/ai/vespa/metrics/Unit.java +++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java @@ -12,6 +12,7 @@ public enum Unit { CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"), DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"), DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"), + DOLLAR_PER_HOUR(BaseUnit.MAGNITUDE, "$/hr"), FAILURE(BaseUnit.FAILURE, "Failures, typically for requests, operations or nodes"), FILE(BaseUnit.FILE, "Data file stored on the disk on a node"), FRACTION(BaseUnit.FRACTION, "A value in the range [0..1]. Higher values can occur for some metrics, but would indicate the value is outside of the allowed range."), @@ -86,6 +87,7 @@ public enum Unit { HIT("hit"), INSTANCE("instance"), ITEM("item"), + MAGNITUDE("magnitude"), MILLISECOND("millisecond", "ms"), NANOSECOND("nanosecond", "ns"), NODE("node"), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 4f94f0fab53..f01f5a30870 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import ai.vespa.metrics.ConfigServerMetrics; import com.yahoo.collections.Pair; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; @@ -18,6 +19,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.Node.State; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.ClusterId; import com.yahoo.vespa.hosted.provision.persistence.CacheStats; @@ -118,7 +120,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { byCluster.forEach((clusterId, clusterNodes) -> { Metric.Context context = getContext(dimensions(clusterId.application(), clusterId.cluster())); updateExclusiveSwitchMetrics(clusterNodes, nodes, context); - updateClusterCostMetrics(clusterNodes, context); + updateClusterCostMetrics(clusterId, clusterNodes, context); }); } @@ -129,9 +131,16 @@ public class MetricsReporter extends NodeRepositoryMaintainer { metric.set("nodes.exclusiveSwitchFraction", exclusiveSwitchRatio,context); } - private void updateClusterCostMetrics(List<Node> clusterNodes, Metric.Context context) { + private void updateClusterCostMetrics(ClusterId clusterId, + List<Node> clusterNodes, Metric.Context context) { + var cluster = nodeRepository().applications().get(clusterId.application()) + .flatMap(application -> application.cluster(clusterId.cluster())); + if (cluster.isEmpty()) return; double cost = clusterNodes.stream().mapToDouble(node -> node.resources().cost()).sum(); - metric.set("cluster.cost", cost, context); + metric.set(ConfigServerMetrics.CLUSTER_COST.baseName(), cost, context); + metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.baseName(), cluster.get().target().ideal().cpu(), context); + metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.baseName(), cluster.get().target().ideal().memory(), context); + metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.baseName(), cluster.get().target().ideal().disk(), context); } private void updateZoneMetrics() { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index 487355a0b75..de2c060a0eb 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -19,8 +19,11 @@ import com.yahoo.vespa.curator.stats.LockStats; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; +import com.yahoo.vespa.hosted.provision.autoscale.Load; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Allocation; +import com.yahoo.vespa.hosted.provision.node.ClusterId; import com.yahoo.vespa.hosted.provision.node.Generation; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; @@ -240,7 +243,7 @@ public class MetricsReporterTest { } @Test - public void non_active_metric() { + public void node_and_cluster_metrics() { ProvisioningTester tester = new ProvisioningTester.Builder().build(); tester.makeReadyHosts(5, new NodeResources(64, 256, 2000, 10)); tester.activateTenantHosts(); @@ -248,18 +251,36 @@ public class MetricsReporterTest { MetricsReporter metricsReporter = metricsReporter(metric, tester); // Application is deployed - ApplicationId application = ApplicationId.from("t1", "a1", "default"); - Map<String, String> dimensions = Map.of("applicationId", application.toFullString()); + ApplicationId applicationId = ApplicationId.from("t1", "a1", "default"); + ClusterSpec clusterSpec = ProvisioningTester.contentClusterSpec(); NodeResources resources = new NodeResources(2, 8, 100, 1); - List<Node> activeNodes = tester.deploy(application, ProvisioningTester.contentClusterSpec(), Capacity.from(new ClusterResources(4, 1, resources))); + Capacity capacity = Capacity.from(new ClusterResources(4, 1, resources)); + + List<Node> activeNodes = tester.deploy(applicationId, clusterSpec, capacity); + var application = tester.nodeRepository().applications().require(applicationId); + application = application.withCluster(clusterSpec.id(), false, capacity); + var cluster = application.cluster(clusterSpec.id()).get().withTarget(new Autoscaling(Autoscaling.Status.ideal, + "test", + Optional.empty(), + tester.clock().instant(), + Load.zero(), + new Load(0.1, 0.2, 0.3), + Autoscaling.Metrics.zero())); + tester.nodeRepository().applications().put(application.with(cluster), tester.nodeRepository().applications().lock(applicationId)); + metricsReporter.maintain(); + Map<String, String> dimensions = Map.of("applicationId", applicationId.toFullString()); assertEquals(0D, getMetric("nodes.nonActiveFraction", metric, dimensions)); assertEquals(4, getMetric("nodes.active", metric, dimensions)); assertEquals(0, getMetric("nodes.nonActive", metric, dimensions)); - Map<String, String> clusterDimensions = Map.of("applicationId", application.toFullString(), - "clusterid", ProvisioningTester.contentClusterSpec().id().value()); + + Map<String, String> clusterDimensions = Map.of("applicationId", applicationId.toFullString(), + "clusterid", clusterSpec.id().value()); assertEquals(1.392, getMetric("cluster.cost", metric, clusterDimensions)); + assertEquals(0.1, getMetric("cluster.load.ideal.cpu", metric, clusterDimensions)); + assertEquals(0.2, getMetric("cluster.load.ideal.memory", metric, clusterDimensions)); + assertEquals(0.3, getMetric("cluster.load.ideal.disk", metric, clusterDimensions)); // One node fails tester.fail(activeNodes.get(0).hostname()); @@ -269,7 +290,7 @@ public class MetricsReporterTest { assertEquals(1, getMetric("nodes.nonActive", metric, dimensions)); // Cluster is removed - tester.deactivate(application); + tester.deactivate(applicationId); metricsReporter.maintain(); assertEquals(1D, getMetric("nodes.nonActiveFraction", metric, dimensions).doubleValue(), Double.MIN_VALUE); assertEquals(0, getMetric("nodes.active", metric, dimensions)); |