summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2023-05-20 19:19:27 +0200
committerGitHub <noreply@github.com>2023-05-20 19:19:27 +0200
commit932b3cecfcc71408c9fe25bd23df992737e516a1 (patch)
tree986270b1e8c4850b2f33729a7b8d941aef5bf31d
parent6d515d0fd6405b2ec322d59d949db47920824b8a (diff)
parente49196cede3117a3622eccc983463912fbed63f7 (diff)
Merge pull request #27152 from vespa-engine/bratseth/cluster-load-metrics
Add cluster load metrics MERGEOK
-rw-r--r--config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java5
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java4
-rw-r--r--metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java8
-rw-r--r--metrics/src/main/java/ai/vespa/metrics/Unit.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java15
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java35
6 files changed, 56 insertions, 14 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
index 362bc7b0964..a60d4d45317 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
@@ -131,6 +131,11 @@ public class VespaMetricSet {
addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS.last());
addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last());
+
return metrics;
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index b791c843357..2728249333e 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -57,7 +57,7 @@ public class SimpleTokenizer implements Tokenizer {
}
/** Tokenize the input, and apply the given transform to each token string. */
- public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) {
+ public Iterable<Token> tokenize(String input, Function<String, String> tokenProcessor) {
if (input.isEmpty()) return List.of();
List<Token> tokens = new ArrayList<>();
@@ -71,7 +71,7 @@ public class SimpleTokenizer implements Tokenizer {
String original = input.substring(prev, next);
tokens.add(new SimpleToken(original).setOffset(prev)
.setType(tokenType)
- .setTokenString(tokenProocessor.apply(original)));
+ .setTokenString(tokenProcessor.apply(original)));
prev = next;
prevType = nextType;
tokenType = prevType;
diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
index 013c50e77cf..d323026e4ca 100644
--- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
+++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
@@ -27,7 +27,6 @@ public enum ConfigServerMetrics implements VespaMetrics {
MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE("maintenanceDeployment.transientFailure", Unit.OPERATION, "Number of maintenance deployments that failed with a transient failure"),
MAINTENANCE_DEPLOYMENT_FAILURE("maintenanceDeployment.failure", Unit.OPERATION, "Number of maintenance deployments that failed with a permanent failure"),
-
// ZooKeeper related metrics
ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"),
ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"),
@@ -45,8 +44,13 @@ public enum ConfigServerMetrics implements VespaMetrics {
ORCHESTRATOR_LOCK_ACQUIRE_TIMEOUT("orchestrator.lock.acquire-timedout", Unit.OPERATION, "Number of times zookeeper lock couldn't be acquired within timeout"),
ORCHESTRATOR_LOCK_ACQUIRE("orchestrator.lock.acquire", Unit.OPERATION, "Number of attempts to acquire zookeeper lock"),
ORCHESTRATOR_LOCK_ACQUIRED("orchestrator.lock.acquired", Unit.OPERATION, "Number of times zookeeper lock was acquired"),
- ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released");
+ ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released"),
+ // Node repository metrics
+ CLUSTER_COST("cluster.cost", Unit.DOLLAR_PER_HOUR, "The cost of the nodes allocated to a certain cluster, in $/hr"),
+ CLUSTER_LOAD_IDEAL_CPU("cluster.load.ideal.cpu", Unit.FRACTION, "The ideal cpu load of a certain cluster"),
+ CLUSTER_LOAD_IDEAL_MEMORY("cluster.load.ideal.memory", Unit.FRACTION, "The ideal memory load of a certain cluster"),
+ CLUSTER_LOAD_IDEAL_DISK("cluster.load.ideal.disk", Unit.FRACTION, "The ideal disk load of a certain cluster");
private final String name;
private final Unit unit;
diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java
index a2123d72246..ee6ea569fc4 100644
--- a/metrics/src/main/java/ai/vespa/metrics/Unit.java
+++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java
@@ -12,6 +12,7 @@ public enum Unit {
CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"),
DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"),
DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"),
+ DOLLAR_PER_HOUR(BaseUnit.DOLLAR, BaseUnit.HOUR, "Total current cost of the cluster in $/hr"),
FAILURE(BaseUnit.FAILURE, "Failures, typically for requests, operations or nodes"),
FILE(BaseUnit.FILE, "Data file stored on the disk on a node"),
FRACTION(BaseUnit.FRACTION, "A value in the range [0..1]. Higher values can occur for some metrics, but would indicate the value is outside of the allowed range."),
@@ -80,10 +81,12 @@ public enum Unit {
CONNECTION("connection"),
DOCUMENT("document"),
DOCUMENTID("documentid"),
+ DOLLAR("dollar"),
FAILURE("failure"),
FILE("file"),
FRACTION("fraction"),
HIT("hit"),
+ HOUR("hour"),
INSTANCE("instance"),
ITEM("item"),
MILLISECOND("millisecond", "ms"),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 4f94f0fab53..f01f5a30870 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.collections.Pair;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
@@ -18,6 +19,7 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.Node.State;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.ClusterId;
import com.yahoo.vespa.hosted.provision.persistence.CacheStats;
@@ -118,7 +120,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
byCluster.forEach((clusterId, clusterNodes) -> {
Metric.Context context = getContext(dimensions(clusterId.application(), clusterId.cluster()));
updateExclusiveSwitchMetrics(clusterNodes, nodes, context);
- updateClusterCostMetrics(clusterNodes, context);
+ updateClusterCostMetrics(clusterId, clusterNodes, context);
});
}
@@ -129,9 +131,16 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
metric.set("nodes.exclusiveSwitchFraction", exclusiveSwitchRatio,context);
}
- private void updateClusterCostMetrics(List<Node> clusterNodes, Metric.Context context) {
+ private void updateClusterCostMetrics(ClusterId clusterId,
+ List<Node> clusterNodes, Metric.Context context) {
+ var cluster = nodeRepository().applications().get(clusterId.application())
+ .flatMap(application -> application.cluster(clusterId.cluster()));
+ if (cluster.isEmpty()) return;
double cost = clusterNodes.stream().mapToDouble(node -> node.resources().cost()).sum();
- metric.set("cluster.cost", cost, context);
+ metric.set(ConfigServerMetrics.CLUSTER_COST.baseName(), cost, context);
+ metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.baseName(), cluster.get().target().ideal().cpu(), context);
+ metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.baseName(), cluster.get().target().ideal().memory(), context);
+ metric.set(ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.baseName(), cluster.get().target().ideal().disk(), context);
}
private void updateZoneMetrics() {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index 487355a0b75..de2c060a0eb 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -19,8 +19,11 @@ import com.yahoo.vespa.curator.stats.LockStats;
import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
+import com.yahoo.vespa.hosted.provision.autoscale.Load;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
+import com.yahoo.vespa.hosted.provision.node.ClusterId;
import com.yahoo.vespa.hosted.provision.node.Generation;
import com.yahoo.vespa.hosted.provision.node.IP;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
@@ -240,7 +243,7 @@ public class MetricsReporterTest {
}
@Test
- public void non_active_metric() {
+ public void node_and_cluster_metrics() {
ProvisioningTester tester = new ProvisioningTester.Builder().build();
tester.makeReadyHosts(5, new NodeResources(64, 256, 2000, 10));
tester.activateTenantHosts();
@@ -248,18 +251,36 @@ public class MetricsReporterTest {
MetricsReporter metricsReporter = metricsReporter(metric, tester);
// Application is deployed
- ApplicationId application = ApplicationId.from("t1", "a1", "default");
- Map<String, String> dimensions = Map.of("applicationId", application.toFullString());
+ ApplicationId applicationId = ApplicationId.from("t1", "a1", "default");
+ ClusterSpec clusterSpec = ProvisioningTester.contentClusterSpec();
NodeResources resources = new NodeResources(2, 8, 100, 1);
- List<Node> activeNodes = tester.deploy(application, ProvisioningTester.contentClusterSpec(), Capacity.from(new ClusterResources(4, 1, resources)));
+ Capacity capacity = Capacity.from(new ClusterResources(4, 1, resources));
+
+ List<Node> activeNodes = tester.deploy(applicationId, clusterSpec, capacity);
+ var application = tester.nodeRepository().applications().require(applicationId);
+ application = application.withCluster(clusterSpec.id(), false, capacity);
+ var cluster = application.cluster(clusterSpec.id()).get().withTarget(new Autoscaling(Autoscaling.Status.ideal,
+ "test",
+ Optional.empty(),
+ tester.clock().instant(),
+ Load.zero(),
+ new Load(0.1, 0.2, 0.3),
+ Autoscaling.Metrics.zero()));
+ tester.nodeRepository().applications().put(application.with(cluster), tester.nodeRepository().applications().lock(applicationId));
+
metricsReporter.maintain();
+ Map<String, String> dimensions = Map.of("applicationId", applicationId.toFullString());
assertEquals(0D, getMetric("nodes.nonActiveFraction", metric, dimensions));
assertEquals(4, getMetric("nodes.active", metric, dimensions));
assertEquals(0, getMetric("nodes.nonActive", metric, dimensions));
- Map<String, String> clusterDimensions = Map.of("applicationId", application.toFullString(),
- "clusterid", ProvisioningTester.contentClusterSpec().id().value());
+
+ Map<String, String> clusterDimensions = Map.of("applicationId", applicationId.toFullString(),
+ "clusterid", clusterSpec.id().value());
assertEquals(1.392, getMetric("cluster.cost", metric, clusterDimensions));
+ assertEquals(0.1, getMetric("cluster.load.ideal.cpu", metric, clusterDimensions));
+ assertEquals(0.2, getMetric("cluster.load.ideal.memory", metric, clusterDimensions));
+ assertEquals(0.3, getMetric("cluster.load.ideal.disk", metric, clusterDimensions));
// One node fails
tester.fail(activeNodes.get(0).hostname());
@@ -269,7 +290,7 @@ public class MetricsReporterTest {
assertEquals(1, getMetric("nodes.nonActive", metric, dimensions));
// Cluster is removed
- tester.deactivate(application);
+ tester.deactivate(applicationId);
metricsReporter.maintain();
assertEquals(1D, getMetric("nodes.nonActiveFraction", metric, dimensions).doubleValue(), Double.MIN_VALUE);
assertEquals(0, getMetric("nodes.active", metric, dimensions));