diff options
16 files changed, 358 insertions, 94 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java new file mode 100644 index 00000000000..92156c959a0 --- /dev/null +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/InfrastructureMetricSet.java @@ -0,0 +1,180 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.model.admin.monitoring; + +import ai.vespa.metrics.*; + +import java.util.Collections; +import java.util.EnumSet; +import java.util.LinkedHashSet; +import java.util.Set; + +import static ai.vespa.metrics.Suffix.average; +import static ai.vespa.metrics.Suffix.count; +import static ai.vespa.metrics.Suffix.last; +import static ai.vespa.metrics.Suffix.max; +import static ai.vespa.metrics.Suffix.sum; + +/** + * Encapsulates vespa service metrics. + * + * @author yngveaasheim + */ +public class InfrastructureMetricSet { + + public static final MetricSet infrastructureMetricSet = new MetricSet("infrastructure", + getInfrastructureMetrics()); + + private static Set<Metric> getInfrastructureMetrics() { + Set<Metric> metrics = new LinkedHashSet<>(); + + metrics.addAll(getConfigServerMetrics()); + metrics.addAll(getControllerMetrics()); + metrics.addAll(getOtherMetrics()); + + return Collections.unmodifiableSet(metrics); + } + + private static Set<Metric> getConfigServerMetrics() { + Set<Metric> metrics = new LinkedHashSet<>(); + + addMetric(metrics, ConfigServerMetrics.REQUESTS.count()); + addMetric(metrics, ConfigServerMetrics.FAILED_REQUESTS.count()); + addMetric(metrics, ConfigServerMetrics.LATENCY, EnumSet.of(max, sum, count)); + addMetric(metrics, ConfigServerMetrics.CACHE_CONFIG_ELEMS.last()); + addMetric(metrics, ConfigServerMetrics.CACHE_CHECKSUM_ELEMS.last()); + addMetric(metrics, ConfigServerMetrics.HOSTS.last()); + addMetric(metrics, ConfigServerMetrics.DELAYED_RESPONSES.count()); + addMetric(metrics, ConfigServerMetrics.SESSION_CHANGE_ERRORS.count()); + + addMetric(metrics, ConfigServerMetrics.ZK_Z_NODES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last. + addMetric(metrics, ConfigServerMetrics.ZK_AVG_LATENCY, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last. + addMetric(metrics, ConfigServerMetrics.ZK_MAX_LATENCY, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last. + addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last. + addMetric(metrics, ConfigServerMetrics.ZK_CONNECTION_LOST.count()); + addMetric(metrics, ConfigServerMetrics.ZK_RECONNECTED.count()); + addMetric(metrics, ConfigServerMetrics.ZK_SUSPENDED.count()); + addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last. + + // Node repository metrics + addMetric(metrics, ConfigServerMetrics.NODES_NON_ACTIVE_FRACTION.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last()); + addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last()); + addMetric(metrics, ConfigServerMetrics.WANT_TO_REBOOT.max()); + addMetric(metrics, ConfigServerMetrics.WANT_TO_RESTART.max()); + addMetric(metrics, ConfigServerMetrics.WANT_TO_RETIRE.max()); + addMetric(metrics, ConfigServerMetrics.RETIRED.max()); + addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max()); + addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last()); + addMetric(metrics, ConfigServerMetrics.WANT_TO_DEPROVISION.max()); + addMetric(metrics, ConfigServerMetrics.SUSPENDED.max()); + addMetric(metrics, ConfigServerMetrics.SOME_SERVICES_DOWN.max()); + addMetric(metrics, ConfigServerMetrics.NODE_FAILER_BAD_NODE.last()); + addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max,average)); + + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU.average()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM.average()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK.average()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_CPU, EnumSet.of(max,average)); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_DISK, EnumSet.of(max,average)); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM, EnumSet.of(max,average)); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_SKEW.last()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PENDING_REDEPLOYMENTS.last()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_ACTIVE_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DIRTY_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_INACTIVE_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PROVISIONED_HOSTS.last()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_READY_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_RESERVED_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_NODES.max()); + + addMetric(metrics, ConfigServerMetrics.RPC_SERVER_WORK_QUEUE_SIZE.average()); + addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.last()); + addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_PREPARE_MILLIS.last()); + + addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max, average)); + addMetric(metrics, ConfigServerMetrics.MAINTENANCE_SUCCESS_FACTOR_DEVIATION.last()); + addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_FAILURE.count()); + addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.count()); + addMetric(metrics, ConfigServerMetrics.OVERCOMMITTED_HOSTS.max()); + addMetric(metrics, ConfigServerMetrics.SPARE_HOST_CAPACITY.last()); + addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES.max()); + + // Container metrics that should be stored for the config-server + addMetric(metrics, ContainerMetrics.HANDLED_LATENCY.max()); + addMetric(metrics, ContainerMetrics.HANDLED_REQUESTS.count()); + addMetric(metrics, ContainerMetrics.HTTP_STATUS_2XX.count()); + addMetric(metrics, ContainerMetrics.HTTP_STATUS_4XX.count()); + addMetric(metrics, ContainerMetrics.HTTP_STATUS_5XX.count()); + addMetric(metrics, ContainerMetrics.JDISC_GC_MS.last()); + addMetric(metrics, ContainerMetrics.MEM_HEAP_USED.average()); + addMetric(metrics, ContainerMetrics.SERVER_NUM_REQUESTS.count()); + addMetric(metrics, ContainerMetrics.SERVER_STARTED_MILLIS.last()); + addMetric(metrics, ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.last()); + + return metrics; + } + + private static Set<Metric> getControllerMetrics() { + Set<Metric> metrics = new LinkedHashSet<>(); + + addMetric(metrics, ControllerMetrics.ATHENZ_REQUEST_ERROR.count()); + addMetric(metrics, ControllerMetrics.ARCHIVE_BUCKET_COUNT.last()); + addMetric(metrics, ControllerMetrics.BILLING_TENANTS.last()); + + addMetric(metrics, ControllerMetrics.DEPLOYMENT_ABORT.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_AVERAGE_DURATION, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last. + addMetric(metrics, ControllerMetrics.DEPLOYMENT_CONVERGENCE_FAILURE.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_DEPLOYMENT_FAILURE.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_ERROR.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_FAILING_UPGRADES.last()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.last()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.max()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.max()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_START.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_SUCCESS.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_TEST_FAILURE.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_WARNINGS.last()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT.count()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_BROKEN_SYSTEM_VERSION.last()); + + addMetric(metrics, ControllerMetrics.OPERATION_APPLICATION.last()); + addMetric(metrics, ControllerMetrics.OPERATION_CHANGEMANAGEMENT.last()); + addMetric(metrics, ControllerMetrics.OPERATION_CONFIGSERVER.last()); + addMetric(metrics, ControllerMetrics.OPERATION_CONTROLLER.last()); + addMetric(metrics, ControllerMetrics.OPERATION_FLAGS.last()); + addMetric(metrics, ControllerMetrics.OPERATION_OS.last()); + addMetric(metrics, ControllerMetrics.OPERATION_ROUTING.last()); + addMetric(metrics, ControllerMetrics.OPERATION_ZONE.last()); + + addMetric(metrics, ControllerMetrics.REMAINING_ROTATIONS.last()); + addMetric(metrics, ControllerMetrics.DNS_QUEUED_REQUESTS.last()); + addMetric(metrics, ControllerMetrics.ZMS_QUOTA_USAGE.last()); + addMetric(metrics, ControllerMetrics.COREDUMP_PROCESSED.count()); + + addMetric(metrics, ControllerMetrics.METERING_AGE_SECONDS.last()); + + return metrics; + } + + private static Set<Metric> getOtherMetrics() { + Set<Metric> metrics = new LinkedHashSet<>(); + + addMetric(metrics, LogdMetrics.LOGD_PROCESSED_LINES.count()); + + return metrics; + } + + private static void addMetric(Set<Metric> metrics, String nameWithSuffix) { + metrics.add(new Metric(nameWithSuffix)); + } + + private static void addMetric(Set<Metric> metrics, VespaMetrics metric, EnumSet<Suffix> suffixes) { + suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); + } +} diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java index 8a2bae364a1..a0d866fb001 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java @@ -2,7 +2,6 @@ package com.yahoo.vespa.model.admin.monitoring; import ai.vespa.metrics.ClusterControllerMetrics; -import ai.vespa.metrics.ConfigServerMetrics; import ai.vespa.metrics.ContainerMetrics; import ai.vespa.metrics.DistributorMetrics; import ai.vespa.metrics.LogdMetrics; @@ -53,7 +52,6 @@ public class VespaMetricSet { metrics.addAll(getClusterControllerMetrics()); metrics.addAll(getSearchChainMetrics()); metrics.addAll(getContainerMetrics()); - metrics.addAll(getConfigServerMetrics()); metrics.addAll(getSentinelMetrics()); metrics.addAll(getOtherMetrics()); @@ -113,55 +111,6 @@ public class VespaMetricSet { return metrics; } - private static Set<Metric> getConfigServerMetrics() { - Set<Metric> metrics = new LinkedHashSet<>(); - - addMetric(metrics, ConfigServerMetrics.REQUESTS.count()); - addMetric(metrics, ConfigServerMetrics.FAILED_REQUESTS.count()); - addMetric(metrics, ConfigServerMetrics.LATENCY, EnumSet.of(max, sum, count)); - addMetric(metrics, ConfigServerMetrics.CACHE_CONFIG_ELEMS.last()); - addMetric(metrics, ConfigServerMetrics.CACHE_CHECKSUM_ELEMS.last()); - addMetric(metrics, ConfigServerMetrics.HOSTS.last()); - addMetric(metrics, ConfigServerMetrics.DELAYED_RESPONSES.count()); - addMetric(metrics, ConfigServerMetrics.SESSION_CHANGE_ERRORS.count()); - - addMetric(metrics, ConfigServerMetrics.ZK_Z_NODES.last()); - addMetric(metrics, ConfigServerMetrics.ZK_AVG_LATENCY.last()); - addMetric(metrics, ConfigServerMetrics.ZK_MAX_LATENCY.last()); - addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS.last()); - addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS.last()); - - // Node repository metrics - addMetric(metrics, ConfigServerMetrics.NODES_NON_ACTIVE_FRACTION.last()); - addMetric(metrics, ConfigServerMetrics.CLUSTER_COST.last()); - addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_CPU.last()); - addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_MEMORY.last()); - addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last()); - addMetric(metrics, ConfigServerMetrics.WANT_TO_REBOOT.max()); - addMetric(metrics, ConfigServerMetrics.WANT_TO_RESTART.max()); - addMetric(metrics, ConfigServerMetrics.RETIRED.max()); - addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max()); - addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last()); - addMetric(metrics, ConfigServerMetrics.WANT_TO_DEPROVISION.max()); - addMetric(metrics, ConfigServerMetrics.SUSPENDED.max()); - addMetric(metrics, ConfigServerMetrics.SOME_SERVICES_DOWN.max()); - addMetric(metrics, ConfigServerMetrics.NODE_FAILER_BAD_NODE.last()); - addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max,average)); - - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU.average()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM.average()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK.average()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU.max()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM.max()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK.max()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_CPU, EnumSet.of(max,average)); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_DISK, EnumSet.of(max,average)); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM, EnumSet.of(max,average)); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_SKEW.last()); - addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PENDING_REDEPLOYMENTS.last()); - - return metrics; - } private static Set<Metric> getContainerMetrics() { Set<Metric> metrics = new LinkedHashSet<>(); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java index 597a2da0f2c..d0a5b1bbe43 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/builder/PredefinedMetricSets.java @@ -13,6 +13,7 @@ import static com.yahoo.vespa.model.admin.monitoring.NetworkMetrics.networkMetri import static com.yahoo.vespa.model.admin.monitoring.SystemMetrics.systemMetricSet; import static com.yahoo.vespa.model.admin.monitoring.DefaultVespaMetrics.defaultVespaMetricSet; import static com.yahoo.vespa.model.admin.monitoring.VespaMetricSet.vespaMetricSet; +import static com.yahoo.vespa.model.admin.monitoring.InfrastructureMetricSet.infrastructureMetricSet; /** * A data object for predefined metric sets. @@ -27,7 +28,8 @@ public class PredefinedMetricSets { vespaMetricSet, systemMetricSet, networkMetricSet, - autoscalingMetricSet + autoscalingMetricSet, + infrastructureMetricSet ); public static Map<String, MetricSet> get() { return sets; } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java index 062133b6b6e..a62bf25bc4c 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.config.server.deploy; +import ai.vespa.metrics.ConfigServerMetrics; import com.yahoo.concurrent.UncheckedTimeoutException; import com.yahoo.config.FileReference; import com.yahoo.config.application.api.DeployLogger; @@ -105,7 +106,7 @@ public class Deployment implements com.yahoo.config.provision.Deployment { if (prepared) return; PrepareParams params = this.params.get(); - try (ActionTimer timer = applicationRepository.timerFor(params.getApplicationId(), "deployment.prepareMillis")) { + try (ActionTimer timer = applicationRepository.timerFor(params.getApplicationId(), ConfigServerMetrics.DEPLOYMENT_PREPARE_MILLIS.baseName())) { this.configChangeActions = sessionRepository().prepareLocalSession(session, deployLogger, params, clock.instant()); this.prepared = true; } catch (Exception e) { @@ -126,7 +127,7 @@ public class Deployment implements com.yahoo.config.provision.Deployment { waitForResourcesOrTimeout(params, session, provisioner); ApplicationId applicationId = session.getApplicationId(); - try (ActionTimer timer = applicationRepository.timerFor(applicationId, "deployment.activateMillis")) { + try (ActionTimer timer = applicationRepository.timerFor(applicationId, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.baseName())) { TimeoutBudget timeoutBudget = params.getTimeoutBudget(); timeoutBudget.assertNotTimedOut(() -> "Timeout exceeded when trying to activate '" + applicationId + "'"); diff --git a/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java b/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java index af98e380f2a..33fa0bd7bab 100644 --- a/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java +++ b/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java @@ -290,7 +290,7 @@ public class StateHandler extends AbstractRequestHandler implements CapabilityRe Tuple latencySeconds = new Tuple(NULL_DIMENSIONS, "latencySeconds", null); for (Map.Entry<MetricDimensions, MetricSet> entry : snapshot) { MetricSet metricSet = entry.getValue(); - MetricValue val = metricSet.get(ContainerMetrics.SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY.baseName()); + MetricValue val = metricSet.get(ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.baseName()); if (val instanceof GaugeMetric gauge) { latencySeconds.add(GaugeMetric.newInstance(gauge.getLast() / 1000, gauge.getMax() / 1000, diff --git a/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java b/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java index 327640cb7ed..2a382d22a68 100644 --- a/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java +++ b/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java @@ -43,7 +43,7 @@ class MetricDefinitions { static final String NUM_SUCCESSFUL_WRITES = ContainerMetrics.SERVER_NUM_SUCCESSFUL_RESPONSE_WRITES.baseName(); static final String NUM_FAILED_WRITES = ContainerMetrics.SERVER_NUM_FAILED_RESPONSE_WRITES.baseName(); - static final String TOTAL_SUCCESSFUL_LATENCY = ContainerMetrics.SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY.baseName(); + static final String TOTAL_SUCCESSFUL_LATENCY = ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.baseName(); static final String TOTAL_FAILED_LATENCY = ContainerMetrics.SERVER_TOTAL_FAILED_RESPONSE_LATENCY.baseName(); static final String TIME_TO_FIRST_BYTE = ContainerMetrics.SERVER_TIME_TO_FIRST_BYTE.baseName(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/athenz/impl/AthenzClientFactoryImpl.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/athenz/impl/AthenzClientFactoryImpl.java index c88eb2f1b86..aa50f9d3a87 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/athenz/impl/AthenzClientFactoryImpl.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/athenz/impl/AthenzClientFactoryImpl.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.athenz.impl; +import ai.vespa.metrics.ControllerMetrics; import com.yahoo.component.annotation.Inject; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.athenz.api.AthenzIdentity; @@ -22,7 +23,7 @@ import java.util.Map; */ public class AthenzClientFactoryImpl implements AthenzClientFactory { - private static final String METRIC_NAME = "athenz.request.error"; + private static final String METRIC_NAME = ControllerMetrics.ATHENZ_REQUEST_ERROR.baseName(); private static final String ATHENZ_SERVICE_DIMENSION = "athenz-service"; private static final String EXCEPTION_DIMENSION = "exception"; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java index b9bff5f777e..2924bb83104 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.deployment; +import ai.vespa.metrics.ControllerMetrics; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId; @@ -13,19 +14,19 @@ import java.util.Map; */ public class JobMetrics { - public static final String start = "deployment.start"; - public static final String nodeAllocationFailure = "deployment.nodeAllocationFailure"; - public static final String endpointCertificateTimeout = "deployment.endpointCertificateTimeout"; - public static final String deploymentFailure = "deployment.deploymentFailure"; - public static final String invalidApplication = "deployment.invalidApplication"; - public static final String convergenceFailure = "deployment.convergenceFailure"; - public static final String testFailure = "deployment.testFailure"; - public static final String noTests = "deployment.noTests"; - public static final String error = "deployment.error"; - public static final String abort = "deployment.abort"; - public static final String cancel = "deployment.cancel"; - public static final String success = "deployment.success"; - public static final String quotaExceeded = "deployment.quotaExceeded"; + public static final String start = ControllerMetrics.DEPLOYMENT_START.baseName(); + public static final String nodeAllocationFailure = ControllerMetrics.DEPLOYMENT_NODE_ALLOCATION_FAILURE.baseName(); + public static final String endpointCertificateTimeout = ControllerMetrics.DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT.baseName(); + public static final String deploymentFailure = ControllerMetrics.DEPLOYMENT_DEPLOYMENT_FAILURE.baseName(); + public static final String invalidApplication = ControllerMetrics.DEPLOYMENT_INVALID_APPLICATION.baseName(); + public static final String convergenceFailure = ControllerMetrics.DEPLOYMENT_CONVERGENCE_FAILURE.baseName(); + public static final String testFailure = ControllerMetrics.DEPLOYMENT_TEST_FAILURE.baseName(); + public static final String noTests = ControllerMetrics.DEPLOYMENT_NO_TESTS.baseName(); + public static final String error = ControllerMetrics.DEPLOYMENT_ERROR.baseName(); + public static final String abort = ControllerMetrics.DEPLOYMENT_ABORT.baseName(); + public static final String cancel = ControllerMetrics.DEPLOYMENT_CANCEL.baseName(); + public static final String success = ControllerMetrics.DEPLOYMENT_SUCCESS.baseName(); + public static final String quotaExceeded = ControllerMetrics.DEPLOYMENT_QUOTA_EXCEEDED.baseName(); private final Metric metric; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java index b2ed0941c8e..33a4802360e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import ai.vespa.metrics.ControllerMetrics; import com.yahoo.config.provision.TenantName; import com.yahoo.config.provision.zone.ZoneId; import com.yahoo.jdisc.Metric; @@ -24,7 +25,7 @@ import java.util.stream.Collectors; */ public class ArchiveAccessMaintainer extends ControllerMaintainer { - private static final String bucketCountMetricName = "archive.bucketCount"; + private static final String bucketCountMetricName = ControllerMetrics.ARCHIVE_BUCKET_COUNT.baseName(); private final CuratorArchiveBucketDb archiveBucketDb; private final ArchiveService archiveService; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 71f9c37577a..6a280e71e98 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -1,6 +1,8 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import ai.vespa.metrics.ConfigServerMetrics; +import ai.vespa.metrics.ControllerMetrics; import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentInstanceSpec; import com.yahoo.config.provision.ApplicationId; @@ -48,22 +50,22 @@ import java.util.stream.Collectors; */ public class MetricsReporter extends ControllerMaintainer { - public static final String TENANT_METRIC = "billing.tenants"; - public static final String DEPLOYMENT_FAIL_METRIC = "deployment.failurePercentage"; - public static final String DEPLOYMENT_AVERAGE_DURATION = "deployment.averageDuration"; - public static final String DEPLOYMENT_FAILING_UPGRADES = "deployment.failingUpgrades"; - public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds"; - public static final String DEPLOYMENT_WARNINGS = "deployment.warnings"; - public static final String DEPLOYMENT_OVERDUE_UPGRADE = "deployment.overdueUpgradeSeconds"; - public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration"; - public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration"; - public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion"; - public static final String PLATFORM_NODE_COUNT = "deployment.nodeCountByPlatformVersion"; - public static final String BROKEN_SYSTEM_VERSION = "deployment.brokenSystemVersion"; - public static final String REMAINING_ROTATIONS = "remaining_rotations"; - public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests"; + public static final String TENANT_METRIC = ControllerMetrics.BILLING_TENANTS.baseName(); + public static final String DEPLOYMENT_FAIL_METRIC = ControllerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.baseName(); + public static final String DEPLOYMENT_AVERAGE_DURATION = ControllerMetrics.DEPLOYMENT_AVERAGE_DURATION.baseName(); + public static final String DEPLOYMENT_FAILING_UPGRADES = ControllerMetrics.DEPLOYMENT_FAILING_UPGRADES.baseName(); + public static final String DEPLOYMENT_BUILD_AGE_SECONDS = ControllerMetrics.DEPLOYMENT_BUILD_AGE_SECONDS.baseName(); + public static final String DEPLOYMENT_WARNINGS = ControllerMetrics.DEPLOYMENT_WARNINGS.baseName(); + public static final String DEPLOYMENT_OVERDUE_UPGRADE = ControllerMetrics.DEPLOYMENT_OVERDUE_UPGRADE_SECONDS.baseName(); + public static final String OS_CHANGE_DURATION = ControllerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.baseName(); + public static final String PLATFORM_CHANGE_DURATION = ControllerMetrics.DEPLOYMENT_PLATFORM_CHANGE_DURATION.baseName(); + public static final String OS_NODE_COUNT = ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.baseName(); + public static final String PLATFORM_NODE_COUNT = ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_PLATFORM_VERSION.baseName(); + public static final String BROKEN_SYSTEM_VERSION = ControllerMetrics.DEPLOYMENT_BROKEN_SYSTEM_VERSION.baseName(); + public static final String REMAINING_ROTATIONS = ControllerMetrics.REMAINING_ROTATIONS.baseName(); + public static final String NAME_SERVICE_REQUESTS_QUEUED = ControllerMetrics.DNS_QUEUED_REQUESTS.baseName(); public static final String OPERATION_PREFIX = "operation."; - public static final String ZMS_QUOTA_USAGE = "zms.quota.usage"; + public static final String ZMS_QUOTA_USAGE = ControllerMetrics.ZMS_QUOTA_USAGE.baseName(); private final Metric metric; private final Clock clock; diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java index 9936b4612c5..ca028547171 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java @@ -27,8 +27,11 @@ public enum ConfigServerMetrics implements VespaMetrics { MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE("maintenanceDeployment.transientFailure", Unit.OPERATION, "Number of maintenance deployments that failed with a transient failure"), MAINTENANCE_DEPLOYMENT_FAILURE("maintenanceDeployment.failure", Unit.OPERATION, "Number of maintenance deployments that failed with a permanent failure"), + MAINTENANCE_SUCCESS_FACTOR_DEVIATION("maintenance.successFactorDeviation", Unit.FRACTION, "Configserver: Maintenance Success Factor Deviation"), + MAINTENANCE_DURATION("maintenance.duration", Unit.MILLISECOND, "Configserver: Maintenance Duration"), + // ZooKeeper related metrics - ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"), + ZK_CONNECTION_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"), ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"), ZK_CONNECTED("configserver.zkConnected", Unit.NODE, "Number of ZooKeeper nodes connected"), ZK_SUSPENDED("configserver.zkSuspended", Unit.NODE, "Number of ZooKeeper nodes suspended"), @@ -107,9 +110,41 @@ public enum ConfigServerMetrics implements VespaMetrics { HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU("hostedVespa.docker.allocatedCapacityCpu", Unit.VCPU, "Total number of allocated VCPUs on tenant hosts managed by hosted Vespa in a zone"), HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM("hostedVespa.docker.allocatedCapacityMem", Unit.GIGABYTE, "Total amount of allocated memory on tenant hosts managed by hosted Vespa in a zone"), HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK("hostedVespa.docker.allocatedCapacityDisk", Unit.GIGABYTE, "Total amount of allocated disk space on tenant hosts managed by hosted Vespa in a zone"), - HOSTED_VESPA_BREAKFIXED_HOSTS("hostedVespa.breakfixedHosts", Unit.HOST, "Number of hosts managed that are breakfixed in a zone"), HOSTED_VESPA_PENDING_REDEPLOYMENTS("hostedVespa.pendingRedeployments", Unit.TASK, "The number of hosted Vespa re-deployments pending"), - HOSTED_VESPA_DOCKER_SKEW("hostedVespa.docker.skew", Unit.FRACTION, "A number in the range 0..1 indicating how well allocated resources are balanced with availability on hosts"); + HOSTED_VESPA_DOCKER_SKEW("hostedVespa.docker.skew", Unit.FRACTION, "A number in the range 0..1 indicating how well allocated resources are balanced with availability on hosts"), + + HOSTED_VESPA_ACTIVE_HOSTS("hostedVespa.activeHosts", Unit.HOST, "The number of managed hosts that are in state \"active\""), + HOSTED_VESPA_BREAKFIXED_HOSTS("hostedVespa.breakfixedHosts", Unit.HOST, "The number of managed hosts that are in state \"breakfixed\""), + HOSTED_VESPA_DEPROVISIONED_HOSTS("hostedVespa.deprovisionedHosts", Unit.HOST, "The number of managed hosts that are in state \"deprovisioned\""), + HOSTED_VESPA_DIRTY_HOSTS("hostedVespa.dirtyHosts", Unit.HOST, "The number of managed hosts that are in state \"dirty\""), + HOSTED_VESPA_FAILED_HOSTS("hostedVespa.failedHosts", Unit.HOST, "The number of managed hosts that are in state \"failed\""), + HOSTED_VESPA_INACTIVE_HOSTS("hostedVespa.inactiveHosts", Unit.HOST, "The number of managed hosts that are in state \"inactive\""), + HOSTED_VESPA_PARKED_HOSTS("hostedVespa.parkedHosts", Unit.HOST, "The number of managed hosts that are in state \"parked\""), + HOSTED_VESPA_PROVISIONED_HOSTS("hostedVespa.provisionedHosts", Unit.HOST, "The number of managed hosts that are in state \"provisioned\""), + HOSTED_VESPA_READY_HOSTS("hostedVespa.readyHosts", Unit.HOST, "The number of managed hosts that are in state \"ready\""), + HOSTED_VESPA_RESERVED_HOSTS("hostedVespa.reservedHosts", Unit.HOST, "The number of managed hosts that are in state \"reserved\""), + + HOSTED_VESPA_ACTIVE_NODES("hostedVespa.activeNodes", Unit.HOST, "The number of managed nodes that are in state \"active\""), + HOSTED_VESPA_BREAKFIXED_NODES("hostedVespa.breakfixedNodes", Unit.HOST, "The number of managed nodes that are in state \"breakfixed\""), + HOSTED_VESPA_DEPROVISIONED_NODES("hostedVespa.deprovisionedNodes", Unit.HOST, "The number of managed nodes that are in state \"deprovisioned\""), + HOSTED_VESPA_DIRTY_NODES("hostedVespa.dirtyNodes", Unit.HOST, "The number of managed nodes that are in state \"dirty\""), + HOSTED_VESPA_FAILED_NODES("hostedVespa.failedNodes", Unit.HOST, "The number of managed nodes that are in state \"failed\""), + HOSTED_VESPA_INACTIVE_NODES("hostedVespa.inactiveNodes", Unit.HOST, "The number of managed nodes that are in state \"inactive\""), + HOSTED_VESPA_PARKED_NODES("hostedVespa.parkedNodes", Unit.HOST, "The number of managed nodes that are in state \"parked\""), + HOSTED_VESPA_PROVISIONED_NODES("hostedVespa.provisionedNodes", Unit.HOST, "The number of managed nodes that are in state \"provisioned\""), + HOSTED_VESPA_READY_NODES("hostedVespa.readyNodes", Unit.HOST, "The number of managed nodes that are in state \"ready\""), + HOSTED_VESPA_RESERVED_NODES("hostedVespa.reservedNodes", Unit.HOST, "The number of managed nodes that are in state \"reserved\""), + + + OVERCOMMITTED_HOSTS("overcommittedHosts", Unit.HOST, "The number of hosts with over-committed resources"), + SPARE_HOST_CAPACITY("spareHostCapacity", Unit.HOST, "The number of spare hosts"), + THROTTLED_HOST_FAILURES("throttledHostFailures", Unit.HOST, "Number of host failures stopped due to throttling"), + THROTTLED_NODE_FAILURES("throttledNodeFailures", Unit.HOST, "Number of node failures stopped due to throttling"), + NODE_FAIL_THROTTLING("nodeFailThrottling", Unit.BINARY, "Metric indicating when node failure throttling is active. The value 1 means active, 0 means inactive"), + + DEPLOYMENT_PREPARE_MILLIS("deployment.prepareMillis", Unit.MILLISECOND, "Duration of deployment preparations"), + DEPLOYMENT_ACTIVATE_MILLIS("deployment.activateMillis", Unit.MILLISECOND, "Duration of deployment activations"); + private final String name; private final Unit unit; diff --git a/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java index ab3fb9b6197..98bd6230762 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java @@ -196,7 +196,7 @@ public enum ContainerMetrics implements VespaMetrics { SERVER_NUM_SUCCESSFUL_RESPONSE_WRITES("serverNumSuccessfulResponseWrites", Unit.REQUEST, "Number of successful response writes"), SERVER_NUM_FAILED_RESPONSE_WRITES("serverNumFailedResponseWrites", Unit.REQUEST, "Number of failed response writes"), - SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY("serverTotalSuccessfulResponseLatency", Unit.MILLISECOND, "Total duration for execution of successful responses"), + SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY("serverTotalSuccessfulResponseLatency", Unit.MILLISECOND, "Total duration for execution of successful responses"), SERVER_TOTAL_FAILED_RESPONSE_LATENCY("serverTotalFailedResponseLatency", Unit.MILLISECOND, "Total duration for execution of failed responses"), SERVER_TIME_TO_FIRST_BYTE("serverTimeToFirstByte", Unit.MILLISECOND, "Time from request has been received by the server until the first byte is returned to the client"), diff --git a/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java new file mode 100644 index 00000000000..4770fe51830 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java @@ -0,0 +1,82 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ +public enum ControllerMetrics implements VespaMetrics { + + ATHENZ_REQUEST_ERROR("athenz.request.error", Unit.REQUEST, "Controller: Athenz request error"), + ARCHIVE_BUCKET_COUNT("archive.bucketCount", Unit.BUCKET, "Controller: Archive bucket count"), + + DEPLOYMENT_START("deployment.start", Unit.DEPLOYMENT, "The number of started deployment jobs"), + DEPLOYMENT_NODE_ALLOCATION_FAILURE("deployment.nodeAllocationFailure", Unit.DEPLOYMENT, "The number of deployments failed due to node allocation failures"), + DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT("deployment.endpointCertificateTimeout", Unit.DEPLOYMENT, "The number of deployments failed due to timeout acquiring endpoint certificate"), + DEPLOYMENT_DEPLOYMENT_FAILURE("deployment.deploymentFailure", Unit.DEPLOYMENT, "The number of deployments that failed"), + DEPLOYMENT_INVALID_APPLICATION("deployment.invalidApplication", Unit.DEPLOYMENT, "Deployments with invalid application package"), + DEPLOYMENT_CONVERGENCE_FAILURE("deployment.convergenceFailure", Unit.DEPLOYMENT, "The number of deployments with convergence failure"), + DEPLOYMENT_TEST_FAILURE("deployment.testFailure", Unit.DEPLOYMENT, "The number of test deployments with test failure"), + DEPLOYMENT_NO_TESTS("deployment.noTests", Unit.DEPLOYMENT, "Deployments with no tests"), + DEPLOYMENT_ERROR("deployment.error", Unit.DEPLOYMENT, "Deployments with error"), + DEPLOYMENT_ABORT("deployment.abort", Unit.DEPLOYMENT, "Deployments that were aborted"), + DEPLOYMENT_CANCEL("deployment.cancel", Unit.DEPLOYMENT, "Deployments that were canceled"), + DEPLOYMENT_SUCCESS("deployment.success", Unit.DEPLOYMENT, "Successful deployments"), + DEPLOYMENT_QUOTA_EXCEEDED("deployment.quotaExceeded", Unit.DEPLOYMENT, "Deployments stopped due to exceeding quota"), + BILLING_TENANTS("billing.tenants", Unit.TENANT, "Billing tenants"), + DEPLOYMENT_FAILURE_PERCENTAGE("deployment.failurePercentage", Unit.PERCENTAGE, "Deployment: Failure percentage"), + DEPLOYMENT_AVERAGE_DURATION("deployment.averageDuration", Unit.SECOND, "Deployment duration"), + DEPLOYMENT_FAILING_UPGRADES("deployment.failingUpgrades", Unit.DEPLOYMENT, "Deployment: Failing upgrades"), + DEPLOYMENT_BUILD_AGE_SECONDS("deployment.buildAgeSeconds", Unit.SECOND, "Deployment: The age of a build deployed"), + DEPLOYMENT_WARNINGS("deployment.warnings", Unit.ITEM, "The number of application related warnings during deployments"), + DEPLOYMENT_OVERDUE_UPGRADE_SECONDS("deployment.overdueUpgradeSeconds", Unit.SECOND, "Deployment: Overdue upgrade period"), + DEPLOYMENT_OS_CHANGE_DURATION("deployment.osChangeDuration", Unit.SECOND, "Deployment: OS change duration"), + DEPLOYMENT_PLATFORM_CHANGE_DURATION("deployment.platformChangeDuration", Unit.SECOND, "Deployment: Platform change duration"), + DEPLOYMENT_NODE_COUNT_BY_OS_VERSION("deployment.nodeCountByOsVersion", Unit.NODE, "Deployment: Node count by OS version"), + DEPLOYMENT_NODE_COUNT_BY_PLATFORM_VERSION("deployment.nodeCountByPlatformVersion", Unit.NODE, "Deployment: Node count by platform version"), + DEPLOYMENT_BROKEN_SYSTEM_VERSION("deployment.brokenSystemVersion", Unit.BINARY, "Deployment: Value 1 for broken system versions, 0 if not"), + REMAINING_ROTATIONS("remaining_rotations", Unit.ROTATION, "Remaining rotations"), + DNS_QUEUED_REQUESTS("dns.queuedRequests", Unit.REQUEST, "Queued DNS requests"), + ZMS_QUOTA_USAGE("zms.quota.usage", Unit.FRACTION, "ZMS Quota usage per resource type"), + COREDUMP_PROCESSED("coredump.processed", Unit.FAILURE,"Controller: Core dumps processed"), + + // Metrics per API, metrics names generated in ControllerMaintainer/MetricsReporter + OPERATION_APPLICATION("operation.application", Unit.REQUEST, "Controller: Requests for /application API"), + OPERATION_CHANGEMANAGEMENT("operation.changemanagement", Unit.REQUEST, "Controller: Requests for /changemanagement API"), + OPERATION_CONFIGSERVER("operation.configserver", Unit.REQUEST, "Controller: Requests for /configserver API"), + OPERATION_CONTROLLER("operation.controller", Unit.REQUEST, "Controller: Requests for /controller API"), + OPERATION_FLAGS("operation.flags", Unit.REQUEST, "Controller: Requests for /flags API"), + OPERATION_OS("operation.os", Unit.REQUEST, "Controller: Requests for /os API"), + OPERATION_ROUTING("operation.routing", Unit.REQUEST, "Controller: Requests for /routing API"), + OPERATION_ZONE("operation.zone", Unit.REQUEST, "Controller: Requests for /zone API"), + + // Metering metrics - not used - TODO: remove from controller code. + METERING_AGE_SECONDS("metering.age.seconds", Unit.SECOND, "Controller: Metering age seconds"), + METERING_COST_HOURLY("metering.cost.hourly", Unit.DOLLAR_PER_HOUR, "Controller: Metering cost hourly"), + METERING_DISK_GB("metering.diskGB", Unit.GIGABYTE, "Controller: Metering disk GB"), + METERING_MEMORY_GB("metering.memoryGB", Unit.GIGABYTE, "Controller: Metering memory GB"), + METERING_VCPU("metering.vcpu", Unit.VCPU, "Controller: Metering VCPU"), + METERING_LAST_REPORTED("metering_last_reported", Unit.SECONDS_SINCE_EPOCH, "Controller: Metering last reported"), + METERING_TOTAL_REPORTED("metering_total_reported", Unit.ITEM, "Controller: Metering total reported (sum of resources)"); + + private final String name; + private final Unit unit; + private final String description; + + ControllerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java index d514b9e9839..48b4e72891f 100644 --- a/metrics/src/main/java/ai/vespa/metrics/Unit.java +++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java @@ -10,6 +10,7 @@ public enum Unit { BYTE(BaseUnit.BYTE, "A collection of 8 bits"), BYTE_PER_SECOND(BaseUnit.BYTE, BaseUnit.SECOND, "A unit of storage capable of holding 8 bits"), CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"), + DEPLOYMENT(BaseUnit.DEPLOYMENT, "A deployment on hosted Vespa"), DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"), DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"), DOLLAR_PER_HOUR(BaseUnit.DOLLAR, BaseUnit.HOUR, "Total current cost of the cluster in $/hr"), @@ -36,10 +37,13 @@ public enum Unit { REQUEST(BaseUnit.REQUEST, "A request sent from a client to a server"), RESPONSE(BaseUnit.RESPONSE, "A response from a server to a client, typically as a response to a request"), RESTART(BaseUnit.RESTART, "A service or node restarts"), + ROTATION(BaseUnit.ROTATION, "Routing rotation"), SCORE(BaseUnit.SCORE, "Relevance score for a document"), SECOND(BaseUnit.SECOND, "Time span of 1 second"), + SECONDS_SINCE_EPOCH(BaseUnit.SECONDS_SINCE_EPOCH,"Seconds since Unix Epoch"), SESSION(BaseUnit.SESSION, "A set of operations taking place during one connection or as part of a higher level operation"), TASK(BaseUnit.TASK, "Piece of work executed by a server, e.g. to perform back-ground data maintenance"), + TENANT(BaseUnit.TENANT, "Tenant that owns zero or more applications in a managed Vespa system"), THREAD(BaseUnit.THREAD, "Computer thread for executing e.g. tasks, operations or queries"), VCPU(BaseUnit.VCPU,"Virtual CPU"), @@ -84,6 +88,7 @@ public enum Unit { BUCKET("bucket"), BYTE("byte"), CONNECTION("connection"), + DEPLOYMENT("deployment"), DOCUMENT("document"), DOCUMENTID("documentid"), DOLLAR("dollar"), @@ -108,10 +113,13 @@ public enum Unit { REQUEST("request"), RESPONSE("response"), RESTART("restart"), + ROTATION("routing rotation"), SCORE("score"), SECOND("second", "s"), + SECONDS_SINCE_EPOCH("seconds since epoch"), SESSION("session"), TASK("task"), + TENANT("tenant"), THREAD("thread"), VCPU("vcpu"), VERSION("version"), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 585a7f341b5..5673b2d74ea 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import ai.vespa.metrics.ConfigServerMetrics; import com.yahoo.concurrent.UncheckedTimeoutException; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; @@ -45,10 +46,10 @@ public class NodeFailer extends NodeRepositoryMaintainer { private static final Logger log = Logger.getLogger(NodeFailer.class.getName()); /** Metric for number of hosts that we want to fail, but cannot due to throttling */ - static final String throttledHostFailuresMetric = "throttledHostFailures"; + static final String throttledHostFailuresMetric = ConfigServerMetrics.THROTTLED_HOST_FAILURES.baseName(); /** Metric for number of nodes that we want to fail, but cannot due to throttling */ - static final String throttledNodeFailuresMetric = "throttledNodeFailures"; + static final String throttledNodeFailuresMetric = ConfigServerMetrics.THROTTLED_NODE_FAILURES.baseName(); /** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */ static final String throttlingActiveMetric = "nodeFailThrottling"; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index dcdcbf09175..da05656fcee 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import ai.vespa.metrics.ConfigServerMetrics; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Deployer; @@ -75,7 +76,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { CapacityChecker capacityChecker = new CapacityChecker(allNodes); List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts(); - metric.set("overcommittedHosts", overcommittedHosts.size(), null); + metric.set(ConfigServerMetrics.OVERCOMMITTED_HOSTS.baseName(), overcommittedHosts.size(), null); retireOvercommitedHosts(allNodes, overcommittedHosts); boolean success = true; @@ -93,7 +94,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { success = false; } } - metric.set("spareHostCapacity", spareHostCapacity, null); + metric.set(ConfigServerMetrics.SPARE_HOST_CAPACITY.baseName(), spareHostCapacity, null); } return success ? 1.0 : 0.0; } |