diff options
4 files changed, 83 insertions, 50 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java index 5f8d8148b41..f9f7f3a00ae 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.model.admin.monitoring; +import com.yahoo.metrics.ClusterControllerMetrics; import com.yahoo.metrics.ConfigServerMetrics; import com.yahoo.metrics.ContainerMetrics; import com.yahoo.metrics.DistributorMetrics; @@ -254,31 +255,29 @@ public class VespaMetricSet { private static Set<Metric> getClusterControllerMetrics() { Set<Metric> metrics = new LinkedHashSet<>(); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_DOWN_COUNT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_INITIALIZING_COUNT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_MAINTENANCE_COUNT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RETIRED_COUNT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_STOPPING_COUNT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_UP_COUNT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_CLUSTER_STATE_CHANGE_COUNT.baseName()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_BUSY_TICK_TIME_MS, EnumSet.of(last, max, sum, count)); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_IDLE_TICK_TIME_MS, EnumSet.of(last, max, sum, count)); + addMetric(metrics, ClusterControllerMetrics.DOWN_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.INITIALIZING_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.MAINTENANCE_COUNT.last()); + addMetric(metrics, 
ClusterControllerMetrics.RETIRED_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.STOPPING_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.UP_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.CLUSTER_STATE_CHANGE_COUNT.baseName()); + addMetric(metrics, ClusterControllerMetrics.BUSY_TICK_TIME_MS, EnumSet.of(last, max, sum, count)); + addMetric(metrics, ClusterControllerMetrics.IDLE_TICK_TIME_MS, EnumSet.of(last, max, sum, count)); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_WORK_MS, EnumSet.of(last, sum, count)); - - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_IS_MASTER.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_REMOTE_TASK_QUEUE_SIZE.last()); + addMetric(metrics, ClusterControllerMetrics.WORK_MS, EnumSet.of(last, sum, count)); + + addMetric(metrics, ClusterControllerMetrics.IS_MASTER.last()); + addMetric(metrics, ClusterControllerMetrics.REMOTE_TASK_QUEUE_SIZE.last()); // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented. // DO NOT RELY ON THIS METRIC YET. 
- addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_NODE_EVENT_COUNT.baseName()); - - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_NODES_ABOVE_LIMIT, EnumSet.of(last, max)); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION, EnumSet.of(last, max)); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION, EnumSet.of(last, max)); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT.last()); - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT.last()); - - addMetric(metrics, ContainerMetrics.CLUSTER_CONTROLLER_REINDEXING_PROGRESS.last()); + addMetric(metrics, ClusterControllerMetrics.NODE_EVENT_COUNT.baseName()); + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_NODES_ABOVE_LIMIT, EnumSet.of(last, max)); + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MAX_MEMORY_UTILIZATION, EnumSet.of(last, max)); + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MAX_DISK_UTILIZATION, EnumSet.of(last, max)); + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MEMORY_LIMIT.last()); + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_DISK_LIMIT.last()); + addMetric(metrics, ClusterControllerMetrics.REINDEXING_PROGRESS.last()); return metrics; } @@ -710,6 +709,10 @@ public class VespaMetricSet { metrics.add(new Metric(nameWithSuffix)); } + private static void addMetric(Set<Metric> metrics, ClusterControllerMetrics metric, EnumSet<Suffix> suffixes) { + suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); + } + private static void addMetric(Set<Metric> metrics, ContainerMetrics metric, EnumSet<Suffix> suffixes) { suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." 
+ suffix.suffix()))); } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java index 61c0c17264c..7920bbed763 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.config.server.metrics; import ai.vespa.util.http.hc5.VespaHttpClientBuilder; import com.yahoo.concurrent.DaemonThreadFactory; +import com.yahoo.metrics.ClusterControllerMetrics; import com.yahoo.metrics.ContainerMetrics; import com.yahoo.slime.ArrayTraverser; import com.yahoo.slime.Cursor; @@ -137,11 +138,11 @@ public class ClusterDeploymentMetricsRetriever { case VESPA_DISTRIBUTOR -> optionalDouble(values.field("vds.distributor.docsstored.average")) .ifPresent(docCount -> aggregator.get().addDocumentCount(docCount)); case VESPA_CONTAINER_CLUSTERCONTROLLER -> - optionalDouble(values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION.max())).ifPresent(memoryUtil -> + optionalDouble(values.field(ClusterControllerMetrics.RESOURCE_USAGE_MAX_MEMORY_UTILIZATION.max())).ifPresent(memoryUtil -> aggregator.get() - .addMemoryUsage(memoryUtil, values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT.last()).asDouble()) - .addDiskUsage(values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION.max()).asDouble(), - values.field(ContainerMetrics.CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT.last()).asDouble())); + .addMemoryUsage(memoryUtil, values.field(ClusterControllerMetrics.RESOURCE_USAGE_MEMORY_LIMIT.last()).asDouble()) + .addDiskUsage(values.field(ClusterControllerMetrics.RESOURCE_USAGE_MAX_DISK_UTILIZATION.max()).asDouble(), + 
values.field(ClusterControllerMetrics.RESOURCE_USAGE_DISK_LIMIT.last()).asDouble())); } } diff --git a/container-core/src/main/java/com/yahoo/metrics/ClusterControllerMetrics.java b/container-core/src/main/java/com/yahoo/metrics/ClusterControllerMetrics.java new file mode 100644 index 00000000000..fabfd5504f7 --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/ClusterControllerMetrics.java @@ -0,0 +1,53 @@ +package com.yahoo.metrics; + +/** + * @author yngve + */ +public enum ClusterControllerMetrics implements VespaMetrics { + + DOWN_COUNT("cluster-controller.down.count", Unit.NODE, "Number of content nodes down"), + INITIALIZING_COUNT("cluster-controller.initializing.count", Unit.NODE, "Number of content nodes initializing"), + MAINTENANCE_COUNT("cluster-controller.maintenance.count", Unit.NODE, "Number of content nodes in maintenance"), + RETIRED_COUNT("cluster-controller.retired.count", Unit.NODE, "Number of content nodes that are retired"), + STOPPING_COUNT("cluster-controller.stopping.count", Unit.NODE, "Number of content nodes currently stopping"), + UP_COUNT("cluster-controller.up.count", Unit.NODE, "Number of content nodes up"), + CLUSTER_STATE_CHANGE_COUNT("cluster-controller.cluster-state-change.count", Unit.NODE, "Number of nodes changing state"), + BUSY_TICK_TIME_MS("cluster-controller.busy-tick-time-ms", Unit.MILLISECOND, "Time busy"), + IDLE_TICK_TIME_MS("cluster-controller.idle-tick-time-ms", Unit.MILLISECOND, "Time idle"), + WORK_MS("cluster-controller.work-ms", Unit.MILLISECOND, "Time used for actual work"), + IS_MASTER("cluster-controller.is-master", Unit.BINARY, "1 if this cluster controller is currently the master, or 0 if not"), + REMOTE_TASK_QUEUE_SIZE("cluster-controller.remote-task-queue.size", Unit.OPERATION, "Number of remote tasks queued"), + // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented. + // DO NOT RELY ON THIS METRIC YET. 
+ NODE_EVENT_COUNT("cluster-controller.node-event.count", Unit.OPERATION, "Number of node events"), + RESOURCE_USAGE_NODES_ABOVE_LIMIT("cluster-controller.resource_usage.nodes_above_limit", Unit.NODE, "The number of content nodes above resource limit, blocking feed"), + RESOURCE_USAGE_MAX_MEMORY_UTILIZATION("cluster-controller.resource_usage.max_memory_utilization", Unit.FRACTION, "Current memory utilisation, per content node"), + RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"), + RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Memory space limit as a fraction of available memory"), + RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"), + REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress"); + + + private final String name; + private final Unit unit; + private final String description; + + ClusterControllerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java index ed1d6f7a001..4da028a8b05 100644 --- a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java +++ b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java @@ -163,30 +163,6 @@ public enum ContainerMetrics implements VespaMetrics { SERVER_THREAD_POOL_SIZE("serverThreadPoolSize", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.size instead."), SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. 
Use jdisc.thread_pool.active_threads instead."), - - // Metrics from the cluster controller - CLUSTER_CONTROLLER_DOWN_COUNT("cluster-controller.down.count", Unit.NODE, "Number of content nodes down"), - CLUSTER_CONTROLLER_INITIALIZING_COUNT("cluster-controller.initializing.count", Unit.NODE, "Number of content nodes initializing"), - CLUSTER_CONTROLLER_MAINTENANCE_COUNT("cluster-controller.maintenance.count", Unit.NODE, "Number of content nodes in maintenance"), - CLUSTER_CONTROLLER_RETIRED_COUNT("cluster-controller.retired.count", Unit.NODE, "Number of content nodes that are retired"), - CLUSTER_CONTROLLER_STOPPING_COUNT("cluster-controller.stopping.count", Unit.NODE, "Number of content nodes currently stopping"), - CLUSTER_CONTROLLER_UP_COUNT("cluster-controller.up.count", Unit.NODE, "Number of content nodes up"), - CLUSTER_CONTROLLER_CLUSTER_STATE_CHANGE_COUNT("cluster-controller.cluster-state-change.count", Unit.NODE, "Number of nodes changing state"), - CLUSTER_CONTROLLER_BUSY_TICK_TIME_MS("cluster-controller.busy-tick-time-ms", Unit.MILLISECOND, "Time busy"), - CLUSTER_CONTROLLER_IDLE_TICK_TIME_MS("cluster-controller.idle-tick-time-ms", Unit.MILLISECOND, "Time idle"), - CLUSTER_CONTROLLER_WORK_MS("cluster-controller.work-ms", Unit.MILLISECOND, "Time used for actual work"), - CLUSTER_CONTROLLER_IS_MASTER("cluster-controller.is-master", Unit.BINARY, "1 if this cluster controller is currently the master, or 0 if not"), - CLUSTER_CONTROLLER_REMOTE_TASK_QUEUE_SIZE("cluster-controller.remote-task-queue.size", Unit.OPERATION, "Number of remote tasks queued"), - // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented. - // DO NOT RELY ON THIS METRIC YET. 
- CLUSTER_CONTROLLER_NODE_EVENT_COUNT("cluster-controller.node-event.count", Unit.OPERATION, "Number of node events"), - CLUSTER_CONTROLLER_RESOURCE_USAGE_NODES_ABOVE_LIMIT("cluster-controller.resource_usage.nodes_above_limit", Unit.NODE, "The number of content nodes above resource limit, blocking feed"), - CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_MEMORY_UTILIZATION("cluster-controller.resource_usage.max_memory_utilization", Unit.FRACTION, "Current memory utilisation, per content node"), - CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"), - CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"), - CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Memory space limit as a fraction of available memory"), - CLUSTER_CONTROLLER_REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress"), - // Java (JRT) TLS metrics JRT_TRANSPORT_TLS_CERTIFICATE_VERIFICATION_FAILURES("jrt.transport.tls-certificate-verification-failures", Unit.FAILURE, "TLS certificate verification failures"), JRT_TRANSPORT_PEER_AUTHORIZATION_FAILURES("jrt.transport.peer-authorization-failures", Unit.FAILURE, "TLS peer authorization failures"), |