about | summary | refs | log | tree | commit | diff | stats
path: root/config-model
diff options
context:
space:
mode:
author: Yngve Aasheim <yngveaasheim@users.noreply.github.com> 2023-08-01 10:20:45 +0200
committer: GitHub <noreply@github.com> 2023-08-01 10:20:45 +0200
commit: 5a682c36b257598915ad99d2fb8e29023b082119 (patch)
tree: 8c7fa7b6c2c2a4ee63ea0bd624cf95d25078c76d /config-model
parent: dd36d60b716a1d785371e1316e33ced5871a1d5f (diff)
parent: ea9d53d09316663cf858eb8de2b5b7c754600a3a (diff)
Merge pull request #27935 from vespa-engine/yngveaasheim/prepare-metricset-config-server
Add separate infrastructure metric set
Diffstat (limited to 'config-model')
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java | 180
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java | 51
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java | 4
3 files changed, 183 insertions, 52 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java
new file mode 100644
index 00000000000..92156c959a0
--- /dev/null
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java
@@ -0,0 +1,180 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.model.admin.monitoring;
+
+import ai.vespa.metrics.*;
+
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import static ai.vespa.metrics.Suffix.average;
+import static ai.vespa.metrics.Suffix.count;
+import static ai.vespa.metrics.Suffix.last;
+import static ai.vespa.metrics.Suffix.max;
+import static ai.vespa.metrics.Suffix.sum;
+
+/**
+ * Encapsulates the infrastructure metric set: config server, controller and logd metrics.
+ *
+ * @author yngveaasheim
+ */
+public class InfrastructureMetricSet {
+
+ public static final MetricSet infrastructureMetricSet = new MetricSet("infrastructure",
+ getInfrastructureMetrics());
+
+ private static Set<Metric> getInfrastructureMetrics() {
+ Set<Metric> metrics = new LinkedHashSet<>();
+
+ metrics.addAll(getConfigServerMetrics());
+ metrics.addAll(getControllerMetrics());
+ metrics.addAll(getOtherMetrics());
+
+ return Collections.unmodifiableSet(metrics);
+ }
+
+ private static Set<Metric> getConfigServerMetrics() {
+ Set<Metric> metrics = new LinkedHashSet<>();
+
+ addMetric(metrics, ConfigServerMetrics.REQUESTS.count());
+ addMetric(metrics, ConfigServerMetrics.FAILED_REQUESTS.count());
+ addMetric(metrics, ConfigServerMetrics.LATENCY, EnumSet.of(max, sum, count));
+ addMetric(metrics, ConfigServerMetrics.CACHE_CONFIG_ELEMS.last());
+ addMetric(metrics, ConfigServerMetrics.CACHE_CHECKSUM_ELEMS.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTS.last());
+ addMetric(metrics, ConfigServerMetrics.DELAYED_RESPONSES.count());
+ addMetric(metrics, ConfigServerMetrics.SESSION_CHANGE_ERRORS.count());
+
+ addMetric(metrics, ConfigServerMetrics.ZK_Z_NODES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_AVG_LATENCY, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_MAX_LATENCY, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_CONNECTION_LOST.count());
+ addMetric(metrics, ConfigServerMetrics.ZK_RECONNECTED.count());
+ addMetric(metrics, ConfigServerMetrics.ZK_SUSPENDED.count());
+ addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+
+ // Node repository metrics
+ addMetric(metrics, ConfigServerMetrics.NODES_NON_ACTIVE_FRACTION.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last());
+ addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last());
+ addMetric(metrics, ConfigServerMetrics.WANT_TO_REBOOT.max());
+ addMetric(metrics, ConfigServerMetrics.WANT_TO_RESTART.max());
+ addMetric(metrics, ConfigServerMetrics.WANT_TO_RETIRE.max());
+ addMetric(metrics, ConfigServerMetrics.RETIRED.max());
+ addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max());
+ addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last());
+ addMetric(metrics, ConfigServerMetrics.WANT_TO_DEPROVISION.max());
+ addMetric(metrics, ConfigServerMetrics.SUSPENDED.max());
+ addMetric(metrics, ConfigServerMetrics.SOME_SERVICES_DOWN.max());
+ addMetric(metrics, ConfigServerMetrics.NODE_FAILER_BAD_NODE.last());
+ addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max,average));
+
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU.average());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM.average());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK.average());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_CPU, EnumSet.of(max,average));
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_DISK, EnumSet.of(max,average));
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM, EnumSet.of(max,average));
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_SKEW.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PENDING_REDEPLOYMENTS.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_ACTIVE_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DIRTY_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_INACTIVE_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PROVISIONED_HOSTS.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_READY_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_RESERVED_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_NODES.max());
+
+ addMetric(metrics, ConfigServerMetrics.RPC_SERVER_WORK_QUEUE_SIZE.average());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.last());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_PREPARE_MILLIS.last());
+
+ addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max, average));
+ addMetric(metrics, ConfigServerMetrics.MAINTENANCE_SUCCESS_FACTOR_DEVIATION.last());
+ addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.OVERCOMMITTED_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.SPARE_HOST_CAPACITY.last());
+ addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES.max());
+
+ // Container metrics that should be stored for the config-server
+ addMetric(metrics, ContainerMetrics.HANDLED_LATENCY.max());
+ addMetric(metrics, ContainerMetrics.HANDLED_REQUESTS.count());
+ addMetric(metrics, ContainerMetrics.HTTP_STATUS_2XX.count());
+ addMetric(metrics, ContainerMetrics.HTTP_STATUS_4XX.count());
+ addMetric(metrics, ContainerMetrics.HTTP_STATUS_5XX.count());
+ addMetric(metrics, ContainerMetrics.JDISC_GC_MS.last());
+ addMetric(metrics, ContainerMetrics.MEM_HEAP_USED.average());
+ addMetric(metrics, ContainerMetrics.SERVER_NUM_REQUESTS.count());
+ addMetric(metrics, ContainerMetrics.SERVER_STARTED_MILLIS.last());
+ addMetric(metrics, ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.last());
+
+ return metrics;
+ }
+
+ private static Set<Metric> getControllerMetrics() {
+ Set<Metric> metrics = new LinkedHashSet<>();
+
+ addMetric(metrics, ControllerMetrics.ATHENZ_REQUEST_ERROR.count());
+ addMetric(metrics, ControllerMetrics.ARCHIVE_BUCKET_COUNT.last());
+ addMetric(metrics, ControllerMetrics.BILLING_TENANTS.last());
+
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_ABORT.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_AVERAGE_DURATION, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_CONVERGENCE_FAILURE.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_DEPLOYMENT_FAILURE.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_ERROR.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_FAILING_UPGRADES.last());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.last());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.max());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.max());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_START.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_SUCCESS.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_TEST_FAILURE.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_WARNINGS.last());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT.count());
+ addMetric(metrics, ControllerMetrics.DEPLOYMENT_BROKEN_SYSTEM_VERSION.last());
+
+ addMetric(metrics, ControllerMetrics.OPERATION_APPLICATION.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_CHANGEMANAGEMENT.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_CONFIGSERVER.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_CONTROLLER.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_FLAGS.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_OS.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_ROUTING.last());
+ addMetric(metrics, ControllerMetrics.OPERATION_ZONE.last());
+
+ addMetric(metrics, ControllerMetrics.REMAINING_ROTATIONS.last());
+ addMetric(metrics, ControllerMetrics.DNS_QUEUED_REQUESTS.last());
+ addMetric(metrics, ControllerMetrics.ZMS_QUOTA_USAGE.last());
+ addMetric(metrics, ControllerMetrics.COREDUMP_PROCESSED.count());
+
+ addMetric(metrics, ControllerMetrics.METERING_AGE_SECONDS.last());
+
+ return metrics;
+ }
+
+ private static Set<Metric> getOtherMetrics() {
+ Set<Metric> metrics = new LinkedHashSet<>();
+
+ addMetric(metrics, LogdMetrics.LOGD_PROCESSED_LINES.count());
+
+ return metrics;
+ }
+
+ private static void addMetric(Set<Metric> metrics, String nameWithSuffix) {
+ metrics.add(new Metric(nameWithSuffix));
+ }
+
+ private static void addMetric(Set<Metric> metrics, VespaMetrics metric, EnumSet<Suffix> suffixes) {
+ suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix())));
+ }
+}
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
index 8a2bae364a1..a0d866fb001 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
@@ -2,7 +2,6 @@
package com.yahoo.vespa.model.admin.monitoring;
import ai.vespa.metrics.ClusterControllerMetrics;
-import ai.vespa.metrics.ConfigServerMetrics;
import ai.vespa.metrics.ContainerMetrics;
import ai.vespa.metrics.DistributorMetrics;
import ai.vespa.metrics.LogdMetrics;
@@ -53,7 +52,6 @@ public class VespaMetricSet {
metrics.addAll(getClusterControllerMetrics());
metrics.addAll(getSearchChainMetrics());
metrics.addAll(getContainerMetrics());
- metrics.addAll(getConfigServerMetrics());
metrics.addAll(getSentinelMetrics());
metrics.addAll(getOtherMetrics());
@@ -113,55 +111,6 @@ public class VespaMetricSet {
return metrics;
}
- private static Set<Metric> getConfigServerMetrics() {
- Set<Metric> metrics = new LinkedHashSet<>();
-
- addMetric(metrics, ConfigServerMetrics.REQUESTS.count());
- addMetric(metrics, ConfigServerMetrics.FAILED_REQUESTS.count());
- addMetric(metrics, ConfigServerMetrics.LATENCY, EnumSet.of(max, sum, count));
- addMetric(metrics, ConfigServerMetrics.CACHE_CONFIG_ELEMS.last());
- addMetric(metrics, ConfigServerMetrics.CACHE_CHECKSUM_ELEMS.last());
- addMetric(metrics, ConfigServerMetrics.HOSTS.last());
- addMetric(metrics, ConfigServerMetrics.DELAYED_RESPONSES.count());
- addMetric(metrics, ConfigServerMetrics.SESSION_CHANGE_ERRORS.count());
-
- addMetric(metrics, ConfigServerMetrics.ZK_Z_NODES.last());
- addMetric(metrics, ConfigServerMetrics.ZK_AVG_LATENCY.last());
- addMetric(metrics, ConfigServerMetrics.ZK_MAX_LATENCY.last());
- addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS.last());
- addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS.last());
-
- // Node repository metrics
- addMetric(metrics, ConfigServerMetrics.NODES_NON_ACTIVE_FRACTION.last());
- addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last());
- addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last());
- addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last());
- addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last());
- addMetric(metrics, ConfigServerMetrics.WANT_TO_REBOOT.max());
- addMetric(metrics, ConfigServerMetrics.WANT_TO_RESTART.max());
- addMetric(metrics, ConfigServerMetrics.RETIRED.max());
- addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max());
- addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last());
- addMetric(metrics, ConfigServerMetrics.WANT_TO_DEPROVISION.max());
- addMetric(metrics, ConfigServerMetrics.SUSPENDED.max());
- addMetric(metrics, ConfigServerMetrics.SOME_SERVICES_DOWN.max());
- addMetric(metrics, ConfigServerMetrics.NODE_FAILER_BAD_NODE.last());
- addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max,average));
-
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU.average());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM.average());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK.average());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU.max());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM.max());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK.max());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_CPU, EnumSet.of(max,average));
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_DISK, EnumSet.of(max,average));
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM, EnumSet.of(max,average));
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_SKEW.last());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PENDING_REDEPLOYMENTS.last());
-
- return metrics;
- }
private static Set<Metric> getContainerMetrics() {
Set<Metric> metrics = new LinkedHashSet<>();
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java
index 597a2da0f2c..d0a5b1bbe43 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java
@@ -13,6 +13,7 @@ import static com.yahoo.vespa.model.admin.monitoring.NetworkMetrics.networkMetri
import static com.yahoo.vespa.model.admin.monitoring.SystemMetrics.systemMetricSet;
import static com.yahoo.vespa.model.admin.monitoring.DefaultVespaMetrics.defaultVespaMetricSet;
import static com.yahoo.vespa.model.admin.monitoring.VespaMetricSet.vespaMetricSet;
+import static com.yahoo.vespa.model.admin.monitoring.InfrastructureMetricSet.infrastructureMetricSet;
/**
* A data object for predefined metric sets.
@@ -27,7 +28,8 @@ public class PredefinedMetricSets {
vespaMetricSet,
systemMetricSet,
networkMetricSet,
- autoscalingMetricSet
+ autoscalingMetricSet,
+ infrastructureMetricSet
);
public static Map<String, MetricSet> get() { return sets; }