about | summary | refs | log | tree | commit | diff | stats
path: root/metrics/src/main/java
diff options
context:
space:
mode:
author yngveaasheim <yngve@yahooinc.com> 2023-08-02 12:32:09 +0200
committer yngveaasheim <yngve@yahooinc.com> 2023-08-02 12:32:09 +0200
commit 5e666c21ebd5a7a8decfc7ebfc31360b433b2ed5 (patch)
tree e9ea75597c8f6dee11558b25751cccb8dae21ffc /metrics/src/main/java
parent 83e8986e9aeb5a703bcd96299d102f36e0934705 (diff)
Include remaining metrics used for alerts
Diffstat (limited to 'metrics/src/main/java')
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java 4
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java 48
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java 19
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java 2
4 files changed, 65 insertions, 8 deletions
diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
index ca028547171..2a6a2986d9b 100644
--- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
+++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
@@ -135,7 +135,6 @@ public enum ConfigServerMetrics implements VespaMetrics {
HOSTED_VESPA_READY_NODES("hostedVespa.readyNodes", Unit.HOST, "The number of managed nodes that are in state \"ready\""),
HOSTED_VESPA_RESERVED_NODES("hostedVespa.reservedNodes", Unit.HOST, "The number of managed nodes that are in state \"reserved\""),
-
OVERCOMMITTED_HOSTS("overcommittedHosts", Unit.HOST, "The number of hosts with over-committed resources"),
SPARE_HOST_CAPACITY("spareHostCapacity", Unit.HOST, "The number of spare hosts"),
THROTTLED_HOST_FAILURES("throttledHostFailures", Unit.HOST, "Number of host failures stopped due to throttling"),
@@ -143,8 +142,9 @@ public enum ConfigServerMetrics implements VespaMetrics {
NODE_FAIL_THROTTLING("nodeFailThrottling", Unit.BINARY, "Metric indicating when node failure throttling is active. The value 1 means active, 0 means inactive"),
DEPLOYMENT_PREPARE_MILLIS("deployment.prepareMillis", Unit.MILLISECOND, "Duration of deployment preparations"),
- DEPLOYMENT_ACTIVATE_MILLIS("deployment.activateMillis", Unit.MILLISECOND, "Duration of deployment activations");
+ DEPLOYMENT_ACTIVATE_MILLIS("deployment.activateMillis", Unit.MILLISECOND, "Duration of deployment activations"),
+ THROTTLED_HOST_PROVISIONING("throttledHostProvisioning", Unit.BINARY, "Value 1 if host provisioning is throttled, 0 if not");
private final String name;
private final Unit unit;
diff --git a/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java b/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java
index 1dc069fed6f..9e23a7625cb 100644
--- a/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java
+++ b/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java
@@ -6,6 +6,9 @@ import ai.vespa.metrics.ContainerMetrics;
import ai.vespa.metrics.SearchNodeMetrics;
import ai.vespa.metrics.StorageMetrics;
import ai.vespa.metrics.DistributorMetrics;
+import ai.vespa.metrics.ClusterControllerMetrics;
+import ai.vespa.metrics.SentinelMetrics;
+import ai.vespa.metrics.NodeAdminMetrics;
import ai.vespa.metrics.Suffix;
import ai.vespa.metrics.VespaMetrics;
@@ -16,6 +19,7 @@ import java.util.Set;
import static ai.vespa.metrics.Suffix.average;
import static ai.vespa.metrics.Suffix.count;
+import static ai.vespa.metrics.Suffix.last;
import static ai.vespa.metrics.Suffix.max;
import static ai.vespa.metrics.Suffix.min;
import static ai.vespa.metrics.Suffix.ninety_five_percentile;
@@ -47,9 +51,12 @@ public class DefaultMetrics {
addContainerMetrics(metrics);
addSearchChainMetrics(metrics);
addDocprocMetrics(metrics);
+ addSearchNodeMetrics(metrics);
addContentMetrics(metrics);
addStorageMetrics(metrics);
addDistributorMetrics(metrics);
+ addClusterControllerMetrics(metrics);
+ addOtherMetrics(metrics);
return Collections.unmodifiableSet(metrics);
}
@@ -69,6 +76,15 @@ public class DefaultMetrics {
addMetric(metrics, ContainerMetrics.JDISC_THREAD_POOL_WORK_QUEUE_CAPACITY.max());
addMetric(metrics, ContainerMetrics.JDISC_THREAD_POOL_WORK_QUEUE_SIZE, EnumSet.of(sum, count, min, max));
addMetric(metrics, ContainerMetrics.SERVER_ACTIVE_THREADS.average());
+
+ // Metrics needed for alerting
+ addMetric(metrics, ContainerMetrics.JDISC_SINGLETON_IS_ACTIVE.last());
+ addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_MISSING_CLIENT_CERT.rate());
+ addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_PROTOCOLS.rate());
+ addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_CHIFERS.rate());
+ addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_UNKNOWN.rate());
+ addMetric(metrics, ContainerMetrics.JDISC_APPLICATION_FAILED_COMPONENT_GRAPHS.rate());
+ addMetric(metrics, ContainerMetrics.ATHENZ_TENANT_CERT_EXPIRY_SECONDS.last());
}
private static void addSearchChainMetrics(Set<Metric> metrics) {
@@ -84,6 +100,13 @@ public class DefaultMetrics {
addMetric(metrics, ContainerMetrics.DOCPROC_DOCUMENTS.sum());
}
+ private static void addSearchNodeMetrics(Set<Metric> metrics) {
+ // Metrics needed for alerting: proton disk and memory resource usage (average) and the feeding-blocked metric (max and last).
+ addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_DISK.average());
+ addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average());
+ addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last
+ }
+
private static void addContentMetrics(Set<Metric> metrics) {
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REQUESTED_DOCUMENTS.rate());
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_LATENCY, EnumSet.of(sum, count, max, average)); // TODO: Remove average with Vespa 9
@@ -114,6 +137,31 @@ public class DefaultMetrics {
private static void addDistributorMetrics(Set<Metric> metrics) {
addMetric(metrics, DistributorMetrics.VDS_DISTRIBUTOR_DOCSSTORED.average());
+
+ // Metrics needed for alerting: the count of VDS_BOUNCER_CLOCK_SKEW_ABORTS.
+ addMetric(metrics, DistributorMetrics.VDS_BOUNCER_CLOCK_SKEW_ABORTS.count());
+ }
+
+ private static void addClusterControllerMetrics(Set<Metric> metrics) {
+ // Metrics needed for alerting: down/maintenance/up counts and is-master (last), plus the three resource-usage metrics (max and last).
+ addMetric(metrics, ClusterControllerMetrics.DOWN_COUNT.last());
+ addMetric(metrics, ClusterControllerMetrics.MAINTENANCE_COUNT.last());
+ addMetric(metrics, ClusterControllerMetrics.UP_COUNT.last());
+ addMetric(metrics, ClusterControllerMetrics.IS_MASTER.last());
+ addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_NODES_ABOVE_LIMIT, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last
+ addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MAX_MEMORY_UTILIZATION, EnumSet.of(last, max)); // TODO: Vespa 9: Remove last
+ addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MAX_DISK_UTILIZATION, EnumSet.of(last, max)); // TODO: Vespa 9: Remove last
+ }
+
+ private static void addSentinelMetrics(Set<Metric> metrics) {
+ // Metrics needed for alerting: last value of the sentinel total-restarts metric. NOTE(review): the visible hunk of the metric-building method adds no call to addSentinelMetrics - confirm it is invoked, otherwise this metric is never registered.
+ addMetric(metrics, SentinelMetrics.SENTINEL_TOTAL_RESTARTS.last());
+ }
+
+ private static void addOtherMetrics(Set<Metric> metrics) {
+ // Metrics needed for alerting: the two NodeAdminMetrics certificate-expiry metrics, added via baseName() rather than a suffix accessor.
+ addMetric(metrics, NodeAdminMetrics.ENDPOINT_CERTIFICATE_EXPIRY_SECONDS.baseName());
+ addMetric(metrics, NodeAdminMetrics.NODE_CERTIFICATE_EXPIRY_SECONDS.baseName());
}
private static void addMetric(Set<Metric> metrics, String nameWithSuffix) {
diff --git a/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java b/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java
index c64be82b937..571d292b54d 100644
--- a/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java
+++ b/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java
@@ -73,14 +73,14 @@ public class InfrastructureMetricSet {
addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max());
addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last());
addMetric(metrics, ConfigServerMetrics.WANT_TO_DEPROVISION.max());
- addMetric(metrics, ConfigServerMetrics.SUSPENDED.max());
+ addMetric(metrics, ConfigServerMetrics.SUSPENDED, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last
addMetric(metrics, ConfigServerMetrics.SOME_SERVICES_DOWN.max());
addMetric(metrics, ConfigServerMetrics.NODE_FAILER_BAD_NODE.last());
addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max,average));
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU.average());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM.average());
- addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK.average());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU, EnumSet.of(average, last)); // TODO: Vespa 9: Remove last?
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM, EnumSet.of(average, last)); // TODO: Vespa 9: Remove last?
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK, EnumSet.of(average, last)); // TODO: Vespa 9: Remove last?
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU.max());
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM.max());
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK.max());
@@ -96,7 +96,10 @@ public class InfrastructureMetricSet {
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PROVISIONED_HOSTS.last());
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_READY_HOSTS.max());
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_RESERVED_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PARKED_HOSTS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_ACTIVE_NODES.max());
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_NODES.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PARKED_NODES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last
addMetric(metrics, ConfigServerMetrics.RPC_SERVER_WORK_QUEUE_SIZE.average());
addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.last());
@@ -108,7 +111,13 @@ public class InfrastructureMetricSet {
addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.count());
addMetric(metrics, ConfigServerMetrics.OVERCOMMITTED_HOSTS.max());
addMetric(metrics, ConfigServerMetrics.SPARE_HOST_CAPACITY.last());
- addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES.max());
+ addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last
+ addMetric(metrics, ConfigServerMetrics.NODE_FAIL_THROTTLING.last());
+
+ addMetric(metrics, ConfigServerMetrics.ORCHESTRATOR_LOCK_ACQUIRE_SUCCESS.count());
+ addMetric(metrics, ConfigServerMetrics.ORCHESTRATOR_LOCK_ACQUIRE_TIMEOUT.count());
+ addMetric(metrics, ConfigServerMetrics.ZONE_WORKING.last());
+ addMetric(metrics, ConfigServerMetrics.THROTTLED_HOST_PROVISIONING.max());
// Container metrics that should be stored for the config-server
addMetric(metrics, ContainerMetrics.HANDLED_LATENCY.max());
diff --git a/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java b/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java
index 3ae3b002ec5..4ec596f8ce7 100644
--- a/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java
+++ b/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java
@@ -434,7 +434,7 @@ public class VespaMetricSet {
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TRANSIENT.max());
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY_MAPPINGS.max());
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_OPEN_FILE_DESCRIPTORS.max());
- addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED.max());
+ addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED, EnumSet.of(max,last)); // TODO: Vespa 9: Remove last
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MALLOC_ARENA.max());
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_ADDRESS_SPACE.max());
addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_FEEDING_BLOCKED.max());