about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kristian Aune <kkraune@users.noreply.github.com> 2023-01-23 13:25:51 +0100
committer GitHub <noreply@github.com> 2023-01-23 13:25:51 +0100
commit d09223693dcd7e1445c3a5988fe66fb0818a47bc (patch)
tree a7149699c99df14b710204654fef54d44fb4fe66
parent 1e0590e752905d50735ae3bb968e559d085ddf01 (diff)
parent e4b791ce79ee6c25d74153691117fe2a11144fb0 (diff)
Merge pull request #25678 from vespa-engine/yngveaasheim/use-enums-for-cluster-controller-metrics
Use enums for cluster controller metrics
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java 46
-rw-r--r-- configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java 9
-rw-r--r-- container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java 30
-rw-r--r-- container-core/src/main/java/com/yahoo/metrics/Unit.java 4
4 files changed, 60 insertions, 29 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
index 7d4c4129640..d73385831ff 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
@@ -237,32 +237,32 @@ public class VespaMetricSet {
private static Set<Metric> getClusterControllerMetrics() {
Set<Metric> metrics = new LinkedHashSet<>();
- metrics.add(new Metric("cluster-controller.down.count.last"));
- metrics.add(new Metric("cluster-controller.initializing.count.last"));
- metrics.add(new Metric("cluster-controller.maintenance.count.last"));
- metrics.add(new Metric("cluster-controller.retired.count.last"));
- metrics.add(new Metric("cluster-controller.stopping.count.last"));
- metrics.add(new Metric("cluster-controller.up.count.last"));
- metrics.add(new Metric("cluster-controller.cluster-state-change.count"));
- addMetric(metrics, "cluster-controller.busy-tick-time-ms", List.of("last", "max", "sum", "count"));
- addMetric(metrics, "cluster-controller.idle-tick-time-ms", List.of("last", "max", "sum", "count"));
-
- addMetric(metrics, "cluster-controller.work-ms", List.of("last", "sum", "count"));
-
- metrics.add(new Metric("cluster-controller.is-master.last"));
- metrics.add(new Metric("cluster-controller.remote-task-queue.size.last"));
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_DOWN_COUNT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_INITIALIZING_COUNT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_MAINTENANCE_COUNT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RETIRED_COUNT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_STOPPING_COUNT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_UP_COUNT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_CLUSTER_STATE_CHANGE_COUNT.baseName());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_BUSY_TICK_TIME_MS, EnumSet.of(last, max, sum, count));
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_IDLE_TICK_TIME_MS, EnumSet.of(last, max, sum, count));
+
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_WORK_MS, EnumSet.of(last, sum, count));
+
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_IS_MASTER.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_REMOTE_TASK_QUEUE_SIZE.last());
// TODO(hakonhall): Update this name once persistent "count" metrics has been implemented.
// DO NOT RELY ON THIS METRIC YET.
- metrics.add(new Metric("cluster-controller.node-event.count"));
-
- addMetric(metrics, "cluster-controller.resource_usage.nodes_above_limit", List.of("last", "max"));
- addMetric(metrics, "cluster-controller.resource_usage.max_memory_utilization", List.of("last", "max"));
- addMetric(metrics, "cluster-controller.resource_usage.max_disk_utilization", List.of("last", "max"));
- metrics.add(new Metric("cluster-controller.resource_usage.disk_limit.last"));
- metrics.add(new Metric("cluster-controller.resource_usage.memory_limit.last"));
-
- metrics.add(new Metric("reindexing.progress.last"));
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_NODE_EVENT_COUNT.baseName());
+
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_NODES_ABOVE_LIMIT, EnumSet.of(last, max));
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION, EnumSet.of(last, max));
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION, EnumSet.of(last, max));
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT.last());
+ addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_REINDEXING_PROGRESS.last());
+
return metrics;
}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java
index c1bc1027690..98f01bada3f 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.config.server.metrics;
import ai.vespa.util.http.hc5.VespaHttpClientBuilder;
import com.yahoo.concurrent.DaemonThreadFactory;
+import com.yahoo.metrics.ContainerMetrics;
import com.yahoo.slime.ArrayTraverser;
import com.yahoo.slime.Cursor;
import com.yahoo.slime.Inspector;
@@ -145,12 +146,12 @@ public class ClusterDeploymentMetricsRetriever {
.ifPresent(docCount -> aggregator.get().addDocumentCount(docCount));
break;
case VESPA_CONTAINER_CLUSTERCONTROLLER:
- optionalDouble(values.field("cluster-controller.resource_usage.max_memory_utilization.max")).ifPresent(memoryUtil ->
+ optionalDouble(values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION.max())).ifPresent(memoryUtil ->
aggregator.get()
.addMemoryUsage(memoryUtil,
- values.field("cluster-controller.resource_usage.memory_limit.last").asDouble())
- .addDiskUsage(values.field("cluster-controller.resource_usage.max_disk_utilization.max").asDouble(),
- values.field("cluster-controller.resource_usage.disk_limit.last").asDouble()));
+ values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT.last()).asDouble())
+ .addDiskUsage(values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION.max()).asDouble(),
+ values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT.last()).asDouble()));
break;
}
}
diff --git a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java
index 513cf499362..35feb291c84 100644
--- a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java
+++ b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java
@@ -150,8 +150,34 @@ public enum ContainerMetrics {
// Deprecated metrics. TODO: Remove on Vespa 9.
SERVER_REJECTED_REQUESTS("serverRejectedRequests", Unit.OPERATION, "Deprecated. Use jdisc.thread_pool.rejected_tasks instead."),
SERVER_THREAD_POOL_SIZE("serverThreadPoolSize", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.size instead."),
- SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.active_threads instead.");
-
+ SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.active_threads instead."),
+
+
+ // Metrics from the cluster controller
+ CLUSTER_CONTROLLER_DOWN_COUNT("cluster-controller.down.count", Unit.NODE, "Number of content nodes down"),
+ CLUSTER_CONTROLLER_INITIALIZING_COUNT("cluster-controller.initializing.count", Unit.NODE, "Number of content nodes initializing"),
+ CLUSTER_CONTROLLER_MAINTENANCE_COUNT("cluster-controller.maintenance.count", Unit.NODE, "Number of content nodes in maintenance"),
+ CLUSTER_CONTROLLER_RETIRED_COUNT("cluster-controller.retired.count", Unit.NODE, "Number of content nodes that are retired"),
+ CLUSTER_CONTROLLER_STOPPING_COUNT("cluster-controller.stopping.count", Unit.NODE, "Number of content nodes currently stopping"),
+ CLUSTER_CONTROLLER_UP_COUNT("cluster-controller.up.count", Unit.NODE, "Number of content nodes up"),
+ CLUSTER_CONTROLLER_CLUSTER_STATE_CHANGE_COUNT("cluster-controller.cluster-state-change.count", Unit.NODE, "Number of nodes changing state"),
+ CLUSTER_CONTROLLER_BUSY_TICK_TIME_MS("cluster-controller.busy-tick-time-ms", Unit.MILLISECOND, "Time busy"),
+ CLUSTER_CONTROLLER_IDLE_TICK_TIME_MS("cluster-controller.idle-tick-time-ms", Unit.MILLISECOND, "Time idle"),
+ CLUSTER_CONTROLLER_WORK_MS("cluster-controller.work-ms", Unit.MILLISECOND, "Time used for actual work"),
+ CLUSTER_CONTROLLER_IS_MASTER("cluster-controller.is-master", Unit.BINARY, "1 if this cluster controller is currently the master, or 0 if not"),
+ CLUSTER_CONTROLLER_REMOTE_TASK_QUEUE_SIZE("cluster-controller.remote-task-queue.size", Unit.OPERATION, "Number of remote tasks queued"),
+ // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented.
+ // DO NOT RELY ON THIS METRIC YET.
+ CLUSTER_CONTROLLER_NODE_EVENT_COUNT("cluster-controller.node-event.count", Unit.OPERATION, "Number of node events"),
+ CLUSTER_CONTROLLER_RESOURCE_USAGE_NODES_ABOVE_LIMIT("cluster-controller.resource_usage.nodes_above_limit", Unit.NODE, "The number of content nodes above resource limit, blocking feed"),
+ CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION("cluster-controller.resource_usage.max_memory_utilization", Unit.FRACTION, "Current memory utilisation, per content node"),
+ CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"),
+ CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Memory space limit as a fraction of available memory"),
+ CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"),
+ CLUSTER_CONTROLLER_REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress");
+
+
+
private final String name;
private final Unit unit;
private final String description;
diff --git a/container-core/src/main/java/com/yahoo/metrics/Unit.java b/container-core/src/main/java/com/yahoo/metrics/Unit.java
index 5a4a0108534..58289143337 100644
--- a/container-core/src/main/java/com/yahoo/metrics/Unit.java
+++ b/container-core/src/main/java/com/yahoo/metrics/Unit.java
@@ -5,6 +5,7 @@ package com.yahoo.metrics;
*/
public enum Unit {
+ BINARY(BaseUnit.BINARY),
BYTE(BaseUnit.BYTE),
CONNECTION(BaseUnit.CONNECTION),
DOCUMENT(BaseUnit.DOCUMENT),
@@ -14,6 +15,7 @@ public enum Unit {
HIT_PER_QUERY(BaseUnit.HIT, BaseUnit.QUERY),
ITEM(BaseUnit.ITEM),
MILLISECOND(BaseUnit.MILLISECOND),
+ NODE(BaseUnit.NODE),
OPERATION(BaseUnit.OPERATION),
OPERATION_PER_SECOND(BaseUnit.OPERATION, BaseUnit.SECOND),
QUERY(BaseUnit.QUERY),
@@ -53,6 +55,7 @@ public enum Unit {
private enum BaseUnit {
+ BINARY("binary"),
BYTE("byte"),
CONNECTION("connection"),
DOCUMENT("document"),
@@ -60,6 +63,7 @@ public enum Unit {
HIT("hit"),
ITEM("item"),
MILLISECOND("millisecond", "ms"),
+ NODE("node"),
OPERATION("operation"),
QUERY("query"),
REQUEST("request"),