diff options
author | yngveaasheim <yngve@yahooinc.com> | 2023-08-02 12:32:09 +0200 |
---|---|---|
committer | yngveaasheim <yngve@yahooinc.com> | 2023-08-02 12:32:09 +0200 |
commit | 5e666c21ebd5a7a8decfc7ebfc31360b433b2ed5 (patch) | |
tree | e9ea75597c8f6dee11558b25751cccb8dae21ffc /metrics | |
parent | 83e8986e9aeb5a703bcd96299d102f36e0934705 (diff) |
Include remaining metrics used for alerts
Diffstat (limited to 'metrics')
4 files changed, 65 insertions, 8 deletions
diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java index ca028547171..2a6a2986d9b 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java @@ -135,7 +135,6 @@ public enum ConfigServerMetrics implements VespaMetrics { HOSTED_VESPA_READY_NODES("hostedVespa.readyNodes", Unit.HOST, "The number of managed nodes that are in state \"ready\""), HOSTED_VESPA_RESERVED_NODES("hostedVespa.reservedNodes", Unit.HOST, "The number of managed nodes that are in state \"reserved\""), - OVERCOMMITTED_HOSTS("overcommittedHosts", Unit.HOST, "The number of hosts with over-committed resources"), SPARE_HOST_CAPACITY("spareHostCapacity", Unit.HOST, "The number of spare hosts"), THROTTLED_HOST_FAILURES("throttledHostFailures", Unit.HOST, "Number of host failures stopped due to throttling"), @@ -143,8 +142,9 @@ public enum ConfigServerMetrics implements VespaMetrics { NODE_FAIL_THROTTLING("nodeFailThrottling", Unit.BINARY, "Metric indicating when node failure throttling is active. The value 1 means active, 0 means inactive"), DEPLOYMENT_PREPARE_MILLIS("deployment.prepareMillis", Unit.MILLISECOND, "Duration of deployment preparations"), - DEPLOYMENT_ACTIVATE_MILLIS("deployment.activateMillis", Unit.MILLISECOND, "Duration of deployment activations"); + DEPLOYMENT_ACTIVATE_MILLIS("deployment.activateMillis", Unit.MILLISECOND, "Duration of deployment activations"), + THROTTLED_HOST_PROVISIONING("throttledHostProvisioning", Unit.BINARY, "Value 1 if host provisioning is throttled, 0 if not"); private final String name; private final Unit unit; diff --git a/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java b/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java index 1dc069fed6f..9e23a7625cb 100644 --- a/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/set/DefaultMetrics.java @@ -6,6 +6,9 @@ import ai.vespa.metrics.ContainerMetrics; import ai.vespa.metrics.SearchNodeMetrics; import ai.vespa.metrics.StorageMetrics; import ai.vespa.metrics.DistributorMetrics; +import ai.vespa.metrics.ClusterControllerMetrics; +import ai.vespa.metrics.SentinelMetrics; +import ai.vespa.metrics.NodeAdminMetrics; import ai.vespa.metrics.Suffix; import ai.vespa.metrics.VespaMetrics; @@ -16,6 +19,7 @@ import java.util.Set; import static ai.vespa.metrics.Suffix.average; import static ai.vespa.metrics.Suffix.count; +import static ai.vespa.metrics.Suffix.last; import static ai.vespa.metrics.Suffix.max; import static ai.vespa.metrics.Suffix.min; import static ai.vespa.metrics.Suffix.ninety_five_percentile; @@ -47,9 +51,12 @@ public class DefaultMetrics { addContainerMetrics(metrics); addSearchChainMetrics(metrics); addDocprocMetrics(metrics); + addSearchNodeMetrics(metrics); addContentMetrics(metrics); addStorageMetrics(metrics); addDistributorMetrics(metrics); + addClusterControllerMetrics(metrics); + addOtherMetrics(metrics); return Collections.unmodifiableSet(metrics); } @@ -69,6 +76,15 @@ public class DefaultMetrics { addMetric(metrics, ContainerMetrics.JDISC_THREAD_POOL_WORK_QUEUE_CAPACITY.max()); addMetric(metrics, ContainerMetrics.JDISC_THREAD_POOL_WORK_QUEUE_SIZE, EnumSet.of(sum, count, min, max)); addMetric(metrics, ContainerMetrics.SERVER_ACTIVE_THREADS.average()); + + // Metrics needed for alerting + addMetric(metrics, ContainerMetrics.JDISC_SINGLETON_IS_ACTIVE.last()); + addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_MISSING_CLIENT_CERT.rate()); + addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_PROTOCOLS.rate()); + addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_CHIFERS.rate()); + addMetric(metrics, ContainerMetrics.JDISC_HTTP_SSL_HANDSHAKE_FAILURE_UNKNOWN.rate()); + addMetric(metrics, ContainerMetrics.JDISC_APPLICATION_FAILED_COMPONENT_GRAPHS.rate()); + addMetric(metrics, ContainerMetrics.ATHENZ_TENANT_CERT_EXPIRY_SECONDS.last()); } private static void addSearchChainMetrics(Set<Metric> metrics) { @@ -84,6 +100,13 @@ public class DefaultMetrics { addMetric(metrics, ContainerMetrics.DOCPROC_DOCUMENTS.sum()); } + private static void addSearchNodeMetrics(Set<Metric> metrics) { + // Metrics needed for alerting + addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_DISK.average()); + addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average()); + addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last + } + private static void addContentMetrics(Set<Metric> metrics) { addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REQUESTED_DOCUMENTS.rate()); addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_LATENCY, EnumSet.of(sum, count, max, average)); // TODO: Remove average with Vespa 9 @@ -114,6 +137,31 @@ public class DefaultMetrics { private static void addDistributorMetrics(Set<Metric> metrics) { addMetric(metrics, DistributorMetrics.VDS_DISTRIBUTOR_DOCSSTORED.average()); + + // Metrics needed for alerting + addMetric(metrics, DistributorMetrics.VDS_BOUNCER_CLOCK_SKEW_ABORTS.count()); + } + + private static void addClusterControllerMetrics(Set<Metric> metrics) { + // Metrics needed for alerting + addMetric(metrics, ClusterControllerMetrics.DOWN_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.MAINTENANCE_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.UP_COUNT.last()); + addMetric(metrics, ClusterControllerMetrics.IS_MASTER.last()); + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_NODES_ABOVE_LIMIT, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MAX_MEMORY_UTILIZATION, EnumSet.of(last, max)); // TODO: Vespa 9: Remove last + addMetric(metrics, ClusterControllerMetrics.RESOURCE_USAGE_MAX_DISK_UTILIZATION, EnumSet.of(last, max)); // TODO: Vespa 9: Remove last + } + + private static void addSentinelMetrics(Set<Metric> metrics) { + // Metrics needed for alerting + addMetric(metrics, SentinelMetrics.SENTINEL_TOTAL_RESTARTS.last()); + } + + private static void addOtherMetrics(Set<Metric> metrics) { + // Metrics needed for alerting + addMetric(metrics, NodeAdminMetrics.ENDPOINT_CERTIFICATE_EXPIRY_SECONDS.baseName()); + addMetric(metrics, NodeAdminMetrics.NODE_CERTIFICATE_EXPIRY_SECONDS.baseName()); } private static void addMetric(Set<Metric> metrics, String nameWithSuffix) { diff --git a/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java b/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java index c64be82b937..571d292b54d 100644 --- a/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java +++ b/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java @@ -73,14 +73,14 @@ public class InfrastructureMetricSet { addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max()); addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last()); addMetric(metrics, ConfigServerMetrics.WANT_TO_DEPROVISION.max()); - addMetric(metrics, ConfigServerMetrics.SUSPENDED.max()); + addMetric(metrics, ConfigServerMetrics.SUSPENDED, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last addMetric(metrics, ConfigServerMetrics.SOME_SERVICES_DOWN.max()); addMetric(metrics, ConfigServerMetrics.NODE_FAILER_BAD_NODE.last()); addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max,average)); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU.average()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM.average()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK.average()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU, EnumSet.of(average, last)); // TODO: Vespa 9: Remove last? + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM, EnumSet.of(average, last)); // TODO: Vespa 9: Remove last? + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK, EnumSet.of(average, last)); // TODO: Vespa 9: Remove last? addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU.max()); addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM.max()); addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK.max()); @@ -96,7 +96,10 @@ public class InfrastructureMetricSet { addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PROVISIONED_HOSTS.last()); addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_READY_HOSTS.max()); addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_RESERVED_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PARKED_HOSTS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_ACTIVE_NODES.max()); addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_NODES.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PARKED_NODES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last addMetric(metrics, ConfigServerMetrics.RPC_SERVER_WORK_QUEUE_SIZE.average()); addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.last()); @@ -108,7 +111,13 @@ public class InfrastructureMetricSet { addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.count()); addMetric(metrics, ConfigServerMetrics.OVERCOMMITTED_HOSTS.max()); addMetric(metrics, ConfigServerMetrics.SPARE_HOST_CAPACITY.last()); - addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES.max()); + addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last + addMetric(metrics, ConfigServerMetrics.NODE_FAIL_THROTTLING.last()); + + addMetric(metrics, ConfigServerMetrics.ORCHESTRATOR_LOCK_ACQUIRE_SUCCESS.count()); + addMetric(metrics, ConfigServerMetrics.ORCHESTRATOR_LOCK_ACQUIRE_TIMEOUT.count()); + addMetric(metrics, ConfigServerMetrics.ZONE_WORKING.last()); + addMetric(metrics, ConfigServerMetrics.THROTTLED_HOST_PROVISIONING.max()); // Container metrics that should be stored for the config-server addMetric(metrics, ContainerMetrics.HANDLED_LATENCY.max()); diff --git a/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java b/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java index 3ae3b002ec5..4ec596f8ce7 100644 --- a/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java +++ b/metrics/src/main/java/ai/vespa/metrics/set/VespaMetricSet.java @@ -434,7 +434,7 @@ public class VespaMetricSet { addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TRANSIENT.max()); addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY_MAPPINGS.max()); addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_OPEN_FILE_DESCRIPTORS.max()); - addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED.max()); + addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED, EnumSet.of(max,last)); // TODO: Vespa 9: Remove last addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MALLOC_ARENA.max()); addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_ADDRESS_SPACE.max()); addMetric(metrics, SearchNodeMetrics.CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_FEEDING_BLOCKED.max()); |