diff options
author | Kristian Aune <kkraune@users.noreply.github.com> | 2023-01-23 13:25:51 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-01-23 13:25:51 +0100 |
commit | d09223693dcd7e1445c3a5988fe66fb0818a47bc (patch) | |
tree | a7149699c99df14b710204654fef54d44fb4fe66 | |
parent | 1e0590e752905d50735ae3bb968e559d085ddf01 (diff) | |
parent | e4b791ce79ee6c25d74153691117fe2a11144fb0 (diff) |
Merge pull request #25678 from vespa-engine/yngveaasheim/use-enums-for-cluster-controller-metrics
Use enums for cluster controller metrics
4 files changed, 60 insertions, 29 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java index 7d4c4129640..d73385831ff 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java @@ -237,32 +237,32 @@ public class VespaMetricSet { private static Set<Metric> getClusterControllerMetrics() { Set<Metric> metrics = new LinkedHashSet<>(); - metrics.add(new Metric("cluster-controller.down.count.last")); - metrics.add(new Metric("cluster-controller.initializing.count.last")); - metrics.add(new Metric("cluster-controller.maintenance.count.last")); - metrics.add(new Metric("cluster-controller.retired.count.last")); - metrics.add(new Metric("cluster-controller.stopping.count.last")); - metrics.add(new Metric("cluster-controller.up.count.last")); - metrics.add(new Metric("cluster-controller.cluster-state-change.count")); - addMetric(metrics, "cluster-controller.busy-tick-time-ms", List.of("last", "max", "sum", "count")); - addMetric(metrics, "cluster-controller.idle-tick-time-ms", List.of("last", "max", "sum", "count")); - - addMetric(metrics, "cluster-controller.work-ms", List.of("last", "sum", "count")); - - metrics.add(new Metric("cluster-controller.is-master.last")); - metrics.add(new Metric("cluster-controller.remote-task-queue.size.last")); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_DOWN_COUNT.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_INITIALIZING_COUNT.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_MAINTENANCE_COUNT.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RETIRED_COUNT.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_STOPPING_COUNT.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_UP_COUNT.last()); + addMetric(metrics, 
ContainerMetrics.CLUSTER_CONTROLLER_CLUSTER_STATE_CHANGE_COUNT.baseName()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_BUSY_TICK_TIME_MS, EnumSet.of(last, max, sum, count)); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_IDLE_TICK_TIME_MS, EnumSet.of(last, max, sum, count)); + + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_WORK_MS, EnumSet.of(last, sum, count)); + + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_IS_MASTER.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_REMOTE_TASK_QUEUE_SIZE.last()); // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented. // DO NOT RELY ON THIS METRIC YET. - metrics.add(new Metric("cluster-controller.node-event.count")); - - addMetric(metrics, "cluster-controller.resource_usage.nodes_above_limit", List.of("last", "max")); - addMetric(metrics, "cluster-controller.resource_usage.max_memory_utilization", List.of("last", "max")); - addMetric(metrics, "cluster-controller.resource_usage.max_disk_utilization", List.of("last", "max")); - metrics.add(new Metric("cluster-controller.resource_usage.disk_limit.last")); - metrics.add(new Metric("cluster-controller.resource_usage.memory_limit.last")); - - metrics.add(new Metric("reindexing.progress.last")); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_NODE_EVENT_COUNT.baseName()); + + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_NODES_ABOVE_LIMIT, EnumSet.of(last, max)); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION, EnumSet.of(last, max)); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION, EnumSet.of(last, max)); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT.last()); + addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT.last()); + addMetric(metrics, 
ContainerMetrics.CLUSTER_CONTROLLER_REINDEXING_PROGRESS.last()); + return metrics; } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java index c1bc1027690..98f01bada3f 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.config.server.metrics; import ai.vespa.util.http.hc5.VespaHttpClientBuilder; import com.yahoo.concurrent.DaemonThreadFactory; +import com.yahoo.metrics.ContainerMetrics; import com.yahoo.slime.ArrayTraverser; import com.yahoo.slime.Cursor; import com.yahoo.slime.Inspector; @@ -145,12 +146,12 @@ public class ClusterDeploymentMetricsRetriever { .ifPresent(docCount -> aggregator.get().addDocumentCount(docCount)); break; case VESPA_CONTAINER_CLUSTERCONTROLLER: - optionalDouble(values.field("cluster-controller.resource_usage.max_memory_utilization.max")).ifPresent(memoryUtil -> + optionalDouble(values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION.max())).ifPresent(memoryUtil -> aggregator.get() .addMemoryUsage(memoryUtil, - values.field("cluster-controller.resource_usage.memory_limit.last").asDouble()) - .addDiskUsage(values.field("cluster-controller.resource_usage.max_disk_utilization.max").asDouble(), - values.field("cluster-controller.resource_usage.disk_limit.last").asDouble())); + values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT.last()).asDouble()) + .addDiskUsage(values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION.max()).asDouble(), + values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT.last()).asDouble())); break; } } diff --git 
a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java index 513cf499362..35feb291c84 100644 --- a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java +++ b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java @@ -150,8 +150,34 @@ public enum ContainerMetrics { // Deprecated metrics. TODO: Remove on Vespa 9. SERVER_REJECTED_REQUESTS("serverRejectedRequests", Unit.OPERATION, "Deprecated. Use jdisc.thread_pool.rejected_tasks instead."), SERVER_THREAD_POOL_SIZE("serverThreadPoolSize", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.size instead."), - SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.active_threads instead."); - + SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.active_threads instead."), + + + // Metrics from the cluster controller + CLUSTER_CONTROLLER_DOWN_COUNT("cluster-controller.down.count", Unit.NODE, "Number of content nodes down"), + CLUSTER_CONTROLLER_INITIALIZING_COUNT("cluster-controller.initializing.count", Unit.NODE, "Number of content nodes initializing"), + CLUSTER_CONTROLLER_MAINTENANCE_COUNT("cluster-controller.maintenance.count", Unit.NODE, "Number of content nodes in maintenance"), + CLUSTER_CONTROLLER_RETIRED_COUNT("cluster-controller.retired.count", Unit.NODE, "Number of content nodes that are retired"), + CLUSTER_CONTROLLER_STOPPING_COUNT("cluster-controller.stopping.count", Unit.NODE, "Number of content nodes currently stopping"), + CLUSTER_CONTROLLER_UP_COUNT("cluster-controller.up.count", Unit.NODE, "Number of content nodes up"), + CLUSTER_CONTROLLER_CLUSTER_STATE_CHANGE_COUNT("cluster-controller.cluster-state-change.count", Unit.NODE, "Number of nodes changing state"), + CLUSTER_CONTROLLER_BUSY_TICK_TIME_MS("cluster-controller.busy-tick-time-ms", Unit.MILLISECOND, "Time busy"), + 
CLUSTER_CONTROLLER_IDLE_TICK_TIME_MS("cluster-controller.idle-tick-time-ms", Unit.MILLISECOND, "Time idle"), + CLUSTER_CONTROLLER_WORK_MS("cluster-controller.work-ms", Unit.MILLISECOND, "Time used for actual work"), + CLUSTER_CONTROLLER_IS_MASTER("cluster-controller.is-master", Unit.BINARY, "1 if this cluster controller is currently the master, or 0 if not"), + CLUSTER_CONTROLLER_REMOTE_TASK_QUEUE_SIZE("cluster-controller.remote-task-queue.size", Unit.OPERATION, "Number of remote tasks queued"), + // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented. + // DO NOT RELY ON THIS METRIC YET. + CLUSTER_CONTROLLER_NODE_EVENT_COUNT("cluster-controller.node-event.count", Unit.OPERATION, "Number of node events"), + CLUSTER_CONTROLLER_RESOURCE_USAGE_NODES_ABOVE_LIMIT("cluster-controller.resource_usage.nodes_above_limit", Unit.NODE, "The number of content nodes above resource limit, blocking feed"), + CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION("cluster-controller.resource_usage.max_memory_utilization", Unit.FRACTION, "Current memory utilisation, per content node"), + CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"), + CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Memory space limit as a fraction of available memory"), + CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"), + CLUSTER_CONTROLLER_REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress"); + + + private final String name; private final Unit unit; private final String description; diff --git a/container-core/src/main/java/com/yahoo/metrics/Unit.java b/container-core/src/main/java/com/yahoo/metrics/Unit.java index 5a4a0108534..58289143337 
100644 --- a/container-core/src/main/java/com/yahoo/metrics/Unit.java +++ b/container-core/src/main/java/com/yahoo/metrics/Unit.java @@ -5,6 +5,7 @@ package com.yahoo.metrics; */ public enum Unit { + BINARY(BaseUnit.BINARY), BYTE(BaseUnit.BYTE), CONNECTION(BaseUnit.CONNECTION), DOCUMENT(BaseUnit.DOCUMENT), @@ -14,6 +15,7 @@ public enum Unit { HIT_PER_QUERY(BaseUnit.HIT, BaseUnit.QUERY), ITEM(BaseUnit.ITEM), MILLISECOND(BaseUnit.MILLISECOND), + NODE(BaseUnit.NODE), OPERATION(BaseUnit.OPERATION), OPERATION_PER_SECOND(BaseUnit.OPERATION, BaseUnit.SECOND), QUERY(BaseUnit.QUERY), @@ -53,6 +55,7 @@ public enum Unit { private enum BaseUnit { + BINARY("binary"), BYTE("byte"), CONNECTION("connection"), DOCUMENT("document"), @@ -60,6 +63,7 @@ public enum Unit { HIT("hit"), ITEM("item"), MILLISECOND("millisecond", "ms"), + NODE("node"), OPERATION("operation"), QUERY("query"), REQUEST("request"), |