about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java | 75
-rw-r--r-- configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java | 5
-rw-r--r-- container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java | 2
-rw-r--r-- container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java | 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java | 27
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java | 13
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java | 75
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java | 2
-rw-r--r-- metrics/src/main/java/ai/vespa/metrics/Unit.java | 6
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 5
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java | 5
11 files changed, 181 insertions, 36 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
index 8a2bae364a1..3d0edf303c3 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java
@@ -125,11 +125,14 @@ public class VespaMetricSet {
addMetric(metrics, ConfigServerMetrics.DELAYED_RESPONSES.count());
addMetric(metrics, ConfigServerMetrics.SESSION_CHANGE_ERRORS.count());
- addMetric(metrics, ConfigServerMetrics.ZK_Z_NODES.last());
- addMetric(metrics, ConfigServerMetrics.ZK_AVG_LATENCY.last());
- addMetric(metrics, ConfigServerMetrics.ZK_MAX_LATENCY.last());
- addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS.last());
- addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS.last());
+ addMetric(metrics, ConfigServerMetrics.ZK_Z_NODES, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_AVG_LATENCY, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_MAX_LATENCY, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_CONNECTIONS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.ZK_CONNECTION_LOST.count());
+ addMetric(metrics, ConfigServerMetrics.ZK_RECONNECTED.count());
+ addMetric(metrics, ConfigServerMetrics.ZK_SUSPENDED.count());
+ addMetric(metrics, ConfigServerMetrics.ZK_OUTSTANDING_REQUESTS, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
// Node repository metrics
addMetric(metrics, ConfigServerMetrics.NODES_NON_ACTIVE_FRACTION.last());
@@ -139,6 +142,7 @@ public class VespaMetricSet {
addMetric(metrics, ConfigServerMetrics.CLUSTER_LOAD_IDEAL_DISK.last());
addMetric(metrics, ConfigServerMetrics.WANT_TO_REBOOT.max());
addMetric(metrics, ConfigServerMetrics.WANT_TO_RESTART.max());
+ addMetric(metrics, ConfigServerMetrics.WANT_TO_RETIRE.max());
addMetric(metrics, ConfigServerMetrics.RETIRED.max());
addMetric(metrics, ConfigServerMetrics.WANT_TO_CHANGE_VESPA_VERSION.max());
addMetric(metrics, ConfigServerMetrics.HAS_WIRE_GUARD_KEY.last());
@@ -159,6 +163,67 @@ public class VespaMetricSet {
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM, EnumSet.of(max,average));
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_SKEW.last());
addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PENDING_REDEPLOYMENTS.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_SKEW.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_CPU, EnumSet.of(max,average));
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_DISK, EnumSet.of(max,average));
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM, EnumSet.of(max,average));
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PENDING_REDEPLOYMENTS.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_ACTIVE_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_DIRTY_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_FAILED_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_INACTIVE_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_PROVISIONED_HOSTS.last());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_READY_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.HOSTED_VESPA_RESERVED_HOSTS.max());
+
+ addMetric(metrics, ConfigServerMetrics.RPC_SERVER_WORK_QUEUE_SIZE.average());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.last());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_PREPARE_MILLIS.last());
+
+ addMetric(metrics, ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD, EnumSet.of(max, average));
+ addMetric(metrics, ConfigServerMetrics.MAINTENANCE_SUCCESS_FACTOR_DEVIATION.last());
+ addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.OVERCOMMITTED_HOSTS.max());
+ addMetric(metrics, ConfigServerMetrics.SPARE_HOST_CAPACITY.last());
+ addMetric(metrics, ConfigServerMetrics.THROTTLED_NODE_FAILURES.max());
+
+ // Controller specific metrics
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ABORT.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_AVERAGE_DURATION, EnumSet.of(max, last)); // TODO: Vespa 9: Remove last.
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_CONVERGENCE_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_DEPLOYMENT_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_ERROR.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_FAILING_UPGRADES.last());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.last());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.max());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.max());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_START.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_SUCCESS.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_TEST_FAILURE.count());
+ addMetric(metrics, ConfigServerMetrics.DEPLOYMENT_WARNINGS.last());
+
+ addMetric(metrics, ConfigServerMetrics.OPERATION_API.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_APPLICATION.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_CONFIGSERVER.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_CONTROLLER.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_FLAGS.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_OS.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_ROUTING.last());
+ addMetric(metrics, ConfigServerMetrics.OPERATION_ZONE.last());
+ addMetric(metrics, ConfigServerMetrics.ZMS_QUOTA_USAGE.last());
+
+ // Container metrics that should be stored for the config-server
+ addMetric(metrics, ContainerMetrics.HANDLED_LATENCY.max());
+ addMetric(metrics, ContainerMetrics.HANDLED_REQUESTS.count());
+ addMetric(metrics, ContainerMetrics.HTTP_STATUS_2XX.count());
+ addMetric(metrics, ContainerMetrics.HTTP_STATUS_4XX.count());
+ addMetric(metrics, ContainerMetrics.HTTP_STATUS_5XX.count());
+ addMetric(metrics, ContainerMetrics.JDISC_GC_MS.last());
+ addMetric(metrics, ContainerMetrics.MEM_HEAP_USED.average());
+ addMetric(metrics, ContainerMetrics.SERVER_NUM_REQUESTS.count());
+ addMetric(metrics, ContainerMetrics.SERVER_STARTED_MILLIS.last());
+ addMetric(metrics, ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.last());
return metrics;
}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
index 062133b6b6e..a62bf25bc4c 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.config.server.deploy;
+import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.FileReference;
import com.yahoo.config.application.api.DeployLogger;
@@ -105,7 +106,7 @@ public class Deployment implements com.yahoo.config.provision.Deployment {
if (prepared) return;
PrepareParams params = this.params.get();
- try (ActionTimer timer = applicationRepository.timerFor(params.getApplicationId(), "deployment.prepareMillis")) {
+ try (ActionTimer timer = applicationRepository.timerFor(params.getApplicationId(), ConfigServerMetrics.DEPLOYMENT_PREPARE_MILLIS.baseName())) {
this.configChangeActions = sessionRepository().prepareLocalSession(session, deployLogger, params, clock.instant());
this.prepared = true;
} catch (Exception e) {
@@ -126,7 +127,7 @@ public class Deployment implements com.yahoo.config.provision.Deployment {
waitForResourcesOrTimeout(params, session, provisioner);
ApplicationId applicationId = session.getApplicationId();
- try (ActionTimer timer = applicationRepository.timerFor(applicationId, "deployment.activateMillis")) {
+ try (ActionTimer timer = applicationRepository.timerFor(applicationId, ConfigServerMetrics.DEPLOYMENT_ACTIVATE_MILLIS.baseName())) {
TimeoutBudget timeoutBudget = params.getTimeoutBudget();
timeoutBudget.assertNotTimedOut(() -> "Timeout exceeded when trying to activate '" + applicationId + "'");
diff --git a/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java b/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java
index af98e380f2a..33fa0bd7bab 100644
--- a/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java
+++ b/container-core/src/main/java/com/yahoo/container/jdisc/state/StateHandler.java
@@ -290,7 +290,7 @@ public class StateHandler extends AbstractRequestHandler implements CapabilityRe
Tuple latencySeconds = new Tuple(NULL_DIMENSIONS, "latencySeconds", null);
for (Map.Entry<MetricDimensions, MetricSet> entry : snapshot) {
MetricSet metricSet = entry.getValue();
- MetricValue val = metricSet.get(ContainerMetrics.SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY.baseName());
+ MetricValue val = metricSet.get(ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.baseName());
if (val instanceof GaugeMetric gauge) {
latencySeconds.add(GaugeMetric.newInstance(gauge.getLast() / 1000,
gauge.getMax() / 1000,
diff --git a/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java b/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java
index 327640cb7ed..2a382d22a68 100644
--- a/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java
+++ b/container-core/src/main/java/com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java
@@ -43,7 +43,7 @@ class MetricDefinitions {
static final String NUM_SUCCESSFUL_WRITES = ContainerMetrics.SERVER_NUM_SUCCESSFUL_RESPONSE_WRITES.baseName();
static final String NUM_FAILED_WRITES = ContainerMetrics.SERVER_NUM_FAILED_RESPONSE_WRITES.baseName();
- static final String TOTAL_SUCCESSFUL_LATENCY = ContainerMetrics.SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY.baseName();
+ static final String TOTAL_SUCCESSFUL_LATENCY = ContainerMetrics.SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY.baseName();
static final String TOTAL_FAILED_LATENCY = ContainerMetrics.SERVER_TOTAL_FAILED_RESPONSE_LATENCY.baseName();
static final String TIME_TO_FIRST_BYTE = ContainerMetrics.SERVER_TIME_TO_FIRST_BYTE.baseName();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
index b9bff5f777e..9ffbb331d0f 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.deployment;
+import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId;
@@ -13,19 +14,19 @@ import java.util.Map;
*/
public class JobMetrics {
- public static final String start = "deployment.start";
- public static final String nodeAllocationFailure = "deployment.nodeAllocationFailure";
- public static final String endpointCertificateTimeout = "deployment.endpointCertificateTimeout";
- public static final String deploymentFailure = "deployment.deploymentFailure";
- public static final String invalidApplication = "deployment.invalidApplication";
- public static final String convergenceFailure = "deployment.convergenceFailure";
- public static final String testFailure = "deployment.testFailure";
- public static final String noTests = "deployment.noTests";
- public static final String error = "deployment.error";
- public static final String abort = "deployment.abort";
- public static final String cancel = "deployment.cancel";
- public static final String success = "deployment.success";
- public static final String quotaExceeded = "deployment.quotaExceeded";
+ public static final String start = ConfigServerMetrics.DEPLOYMENT_START.baseName();
+ public static final String nodeAllocationFailure = ConfigServerMetrics.DEPLOYMENT_NODE_ALLOCATION_FAILURE.baseName();
+ public static final String endpointCertificateTimeout = ConfigServerMetrics.DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT.baseName();
+ public static final String deploymentFailure = ConfigServerMetrics.DEPLOYMENT_DEPLOYMENT_FAILURE.baseName();
+ public static final String invalidApplication = ConfigServerMetrics.DEPLOYMENT_INVALID_APPLICATION.baseName();
+ public static final String convergenceFailure = ConfigServerMetrics.DEPLOYMENT_CONVERGENCE_FAILURE.baseName();
+ public static final String testFailure = ConfigServerMetrics.DEPLOYMENT_TEST_FAILURE.baseName();
+ public static final String noTests = ConfigServerMetrics.DEPLOYMENT_NO_TESTS.baseName();
+ public static final String error = ConfigServerMetrics.DEPLOYMENT_ERROR.baseName();
+ public static final String abort = ConfigServerMetrics.DEPLOYMENT_ABORT.baseName();
+ public static final String cancel = ConfigServerMetrics.DEPLOYMENT_CANCEL.baseName();
+ public static final String success = ConfigServerMetrics.DEPLOYMENT_SUCCESS.baseName();
+ public static final String quotaExceeded = ConfigServerMetrics.DEPLOYMENT_QUOTA_EXCEEDED.baseName();
private final Metric metric;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 71f9c37577a..96361b530e6 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.component.Version;
import com.yahoo.config.application.api.DeploymentInstanceSpec;
import com.yahoo.config.provision.ApplicationId;
@@ -49,15 +50,15 @@ import java.util.stream.Collectors;
public class MetricsReporter extends ControllerMaintainer {
public static final String TENANT_METRIC = "billing.tenants";
- public static final String DEPLOYMENT_FAIL_METRIC = "deployment.failurePercentage";
- public static final String DEPLOYMENT_AVERAGE_DURATION = "deployment.averageDuration";
- public static final String DEPLOYMENT_FAILING_UPGRADES = "deployment.failingUpgrades";
+ public static final String DEPLOYMENT_FAIL_METRIC = ConfigServerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.baseName();
+ public static final String DEPLOYMENT_AVERAGE_DURATION = ConfigServerMetrics.DEPLOYMENT_AVERAGE_DURATION.baseName();
+ public static final String DEPLOYMENT_FAILING_UPGRADES = ConfigServerMetrics.DEPLOYMENT_FAILING_UPGRADES.baseName();
public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds";
- public static final String DEPLOYMENT_WARNINGS = "deployment.warnings";
+ public static final String DEPLOYMENT_WARNINGS = ConfigServerMetrics.DEPLOYMENT_WARNINGS.baseName();
public static final String DEPLOYMENT_OVERDUE_UPGRADE = "deployment.overdueUpgradeSeconds";
- public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration";
+ public static final String OS_CHANGE_DURATION = ConfigServerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.baseName();
public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration";
- public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion";
+ public static final String OS_NODE_COUNT = ConfigServerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.baseName();
public static final String PLATFORM_NODE_COUNT = "deployment.nodeCountByPlatformVersion";
public static final String BROKEN_SYSTEM_VERSION = "deployment.brokenSystemVersion";
public static final String REMAINING_ROTATIONS = "remaining_rotations";
diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
index 9936b4612c5..7a52c3384cd 100644
--- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
+++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
@@ -27,8 +27,11 @@ public enum ConfigServerMetrics implements VespaMetrics {
MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE("maintenanceDeployment.transientFailure", Unit.OPERATION, "Number of maintenance deployments that failed with a transient failure"),
MAINTENANCE_DEPLOYMENT_FAILURE("maintenanceDeployment.failure", Unit.OPERATION, "Number of maintenance deployments that failed with a permanent failure"),
+ MAINTENANCE_SUCCESS_FACTOR_DEVIATION("maintenance.successFactorDeviation", Unit.FRACTION, "Configserver: Maintenance Success Factor Deviation"),
+ MAINTENANCE_DURATION("maintenance.duration", Unit.MILLISECOND, "Configserver: Maintenance Duration"),
+
// ZooKeeper related metrics
- ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"),
+ ZK_CONNECTION_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"),
ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"),
ZK_CONNECTED("configserver.zkConnected", Unit.NODE, "Number of ZooKeeper nodes connected"),
ZK_SUSPENDED("configserver.zkSuspended", Unit.NODE, "Number of ZooKeeper nodes suspended"),
@@ -107,9 +110,75 @@ public enum ConfigServerMetrics implements VespaMetrics {
HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU("hostedVespa.docker.allocatedCapacityCpu", Unit.VCPU, "Total number of allocated VCPUs on tenant hosts managed by hosted Vespa in a zone"),
HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM("hostedVespa.docker.allocatedCapacityMem", Unit.GIGABYTE, "Total amount of allocated memory on tenant hosts managed by hosted Vespa in a zone"),
HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK("hostedVespa.docker.allocatedCapacityDisk", Unit.GIGABYTE, "Total amount of allocated disk space on tenant hosts managed by hosted Vespa in a zone"),
- HOSTED_VESPA_BREAKFIXED_HOSTS("hostedVespa.breakfixedHosts", Unit.HOST, "Number of hosts managed that are breakfixed in a zone"),
HOSTED_VESPA_PENDING_REDEPLOYMENTS("hostedVespa.pendingRedeployments", Unit.TASK, "The number of hosted Vespa re-deployments pending"),
- HOSTED_VESPA_DOCKER_SKEW("hostedVespa.docker.skew", Unit.FRACTION, "A number in the range 0..1 indicating how well allocated resources are balanced with availability on hosts");
+ HOSTED_VESPA_DOCKER_SKEW("hostedVespa.docker.skew", Unit.FRACTION, "A number in the range 0..1 indicating how well allocated resources are balanced with availability on hosts"),
+ HOSTED_VESPA_ACTIVE_HOSTS("hostedVespa.activeHosts", Unit.HOST, "The number of managed hosts that are in state \"active\""),
+ HOSTED_VESPA_BREAKFIXED_HOSTS("hostedVespa.breakfixedHosts", Unit.HOST, "The number of managed hosts that are in state \"breakfixed\""),
+ HOSTED_VESPA_DEPROVISIONED_HOSTS("hostedVespa.deprovisionedHosts", Unit.HOST, "The number of managed hosts that are in state \"deprovisioned\""),
+ HOSTED_VESPA_DIRTY_HOSTS("hostedVespa.dirtyHosts", Unit.HOST, "The number of managed hosts that are in state \"dirty\""),
+ HOSTED_VESPA_FAILED_HOSTS("hostedVespa.failedHosts", Unit.HOST, "The number of managed hosts that are in state \"failed\""),
+ HOSTED_VESPA_INACTIVE_HOSTS("hostedVespa.inactiveHosts", Unit.HOST, "The number of managed hosts that are in state \"inactive\""),
+ HOSTED_VESPA_PARKED_HOSTS("hostedVespa.parkedHosts", Unit.HOST, "The number of managed hosts that are in state \"parked\""),
+ HOSTED_VESPA_PROVISIONED_HOSTS("hostedVespa.provisionedHosts", Unit.HOST, "The number of managed hosts that are in state \"provisioned\""),
+ HOSTED_VESPA_READY_HOSTS("hostedVespa.readyHosts", Unit.HOST, "The number of managed hosts that are in state \"ready\""),
+ HOSTED_VESPA_RESERVED_HOSTS("hostedVespa.reservedHosts", Unit.HOST, "The number of managed hosts that are in state \"reserved\""),
+
+
+ OVERCOMMITTED_HOSTS("overcommittedHosts", Unit.HOST, "The number of hosts with over-committed resources"),
+ SPARE_HOST_CAPACITY("spareHostCapacity", Unit.HOST, "The number of spare hosts"),
+ THROTTLED_HOST_FAILURES("throttledHostFailures", Unit.HOST, "Number of host failures stopped due to throttling"),
+ THROTTLED_NODE_FAILURES("throttledNodeFailures", Unit.HOST, "Number of node failures stopped due to throttling"),
+ NODE_FAIL_THROTTLING("nodeFailThrottling", Unit.BINARY, "Metric indicating when node failure throttling is active. The value 1 means active, 0 means inactive"),
+
+ DEPLOYMENT_PREPARE_MILLIS("deployment.prepareMillis", Unit.MILLISECOND, "Duration of deployment preparations"),
+ DEPLOYMENT_ACTIVATE_MILLIS("deployment.activateMillis", Unit.MILLISECOND, "Duration of deployment activations"),
+
+
+ // Controller specific metrics
+ // TODO: Separate out to separate class, don't document
+ DEPLOYMENT_START("deployment.start", Unit.DEPLOYMENT, "The number of started deployment jobs"),
+ DEPLOYMENT_NODE_ALLOCATION_FAILURE("deployment.nodeAllocationFailure", Unit.DEPLOYMENT, "The number of deployments failed due to node allocation failures"),
+ DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT("deployment.endpointCertificateTimeout", Unit.DEPLOYMENT, "The number of deployments failed due to timeout acquiring endpoint certificate"),
+ DEPLOYMENT_DEPLOYMENT_FAILURE("deployment.deploymentFailure", Unit.DEPLOYMENT, "The number of deployments that failed"),
+ DEPLOYMENT_INVALID_APPLICATION("deployment.invalidApplication", Unit.DEPLOYMENT, "Deployments with invalid application package"),
+ DEPLOYMENT_CONVERGENCE_FAILURE("deployment.convergenceFailure", Unit.DEPLOYMENT, "The number of deployments with convergence failure"),
+ DEPLOYMENT_TEST_FAILURE("deployment.testFailure", Unit.DEPLOYMENT, "The number of test deployments with test failure"),
+ DEPLOYMENT_NO_TESTS("deployment.noTests", Unit.DEPLOYMENT, "Deployments with no tests"),
+ DEPLOYMENT_ERROR("deployment.error", Unit.DEPLOYMENT, "Deployments with error"),
+ DEPLOYMENT_ABORT("deployment.abort", Unit.DEPLOYMENT, "Deployments that were aborted"),
+ DEPLOYMENT_CANCEL("deployment.cancel", Unit.DEPLOYMENT, "Deployments that were canceled"),
+ DEPLOYMENT_SUCCESS("deployment.success", Unit.DEPLOYMENT, "Successful deployments"),
+ DEPLOYMENT_QUOTA_EXCEEDED("deployment.quotaExceeded", Unit.DEPLOYMENT, "Deployments stopped due to exceeding quota"),
+
+
+ BILLING_TENANTS("billing.tenants", Unit.TENANT, "Billing tenants"),
+ DEPLOYMENT_FAILURE_PERCENTAGE("deployment.failurePercentage", Unit.PERCENTAGE, "Deployment: Failure percentage"),
+ DEPLOYMENT_AVERAGE_DURATION("deployment.averageDuration", Unit.SECOND, "Deployment duration"),
+ DEPLOYMENT_FAILING_UPGRADES("deployment.failingUpgrades", Unit.DEPLOYMENT, "Deployment: Failing upgrades"),
+ DEPLOYMENT_BUILDING_AGE_SECONDS("deployment.buildAgeSeconds", Unit.SECOND, "Deployment: Build age"),
+ DEPLOYMENT_WARNINGS("deployment.warnings", Unit.ITEM, "The number of application related warnings during deployments"),
+ DEPLOYMENT_OVERDUE_UPGRADE_SECONDS("deployment.overdueUpgradeSeconds", Unit.SECOND, "Deployment: Overdue upgrade period"),
+ DEPLOYMENT_OS_CHANGE_DURATION("deployment.osChangeDuration", Unit.SECOND, "Deployment: OS change duration"),
+ DEPLOYMENT_PLATFORM_CHANGE_DURATION("deployment.platformChangeDuration", Unit.SECOND, "Deployment: Platform change duration"),
+ DEPLOYMENT_NODE_COUNT_BY_OS_VERSION("deployment.nodeCountByOsVersion", Unit.NODE, "Deployment: Node count by OS version"),
+ DEPLOYMENT_NODE_COUNT_BY_PLATFORM_VERSION("deployment.nodeCountByPlatformVersion", Unit.NODE, "Deployment: Node count by platform version"),
+ DEPLOYMENT_BROKEN_SYSTEM_VERSION("deployment.brokenSystemVersion", Unit.BINARY, "Deployment: Value 1 for broken system versions, 0 if not"),
+ REMAINING_ROTATIONS("remaining_rotations", Unit.ROTATION, "Remaining rotations"),
+ DNS_QUEUED_REQUESTS("dns.queuedRequests", Unit.REQUEST, "Queued DNS requests"),
+ ZMS_QUOTA_USAGE("zms.quota.usage", Unit.FRACTION, "ZMS Quota usage per resource type"),
+
+
+ // Metrics per API, metrics created in ControllerMaintainer/MetricsReporter
+ OPERATION_API("operation.api.last", Unit.REQUEST, "Controller: Requests for /api API"),
+ OPERATION_APPLICATION("operation.application", Unit.REQUEST, "Controller: Requests for /application API"),
+ OPERATION_CONFIGSERVER("operation.configserver", Unit.REQUEST, "Controller: Requests for /configserver API"),
+ OPERATION_CONTROLLER("operation.controller", Unit.REQUEST, "Controller: Requests for /controller API"),
+ OPERATION_FLAGS("operation.flags", Unit.REQUEST, "Controller: Requests for /flags API"),
+ OPERATION_OS("operation.os", Unit.REQUEST, "Controller: Requests for /os API"),
+ OPERATION_ROUTING("operation.routing", Unit.REQUEST, "Controller: Requests for /routing API"),
+ OPERATION_ZONE("operation.zone", Unit.REQUEST, "Controller: Requests for /zone API");
+
+
private final String name;
private final Unit unit;
diff --git a/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java
index ab3fb9b6197..98bd6230762 100644
--- a/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java
+++ b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java
@@ -196,7 +196,7 @@ public enum ContainerMetrics implements VespaMetrics {
SERVER_NUM_SUCCESSFUL_RESPONSE_WRITES("serverNumSuccessfulResponseWrites", Unit.REQUEST, "Number of successful response writes"),
SERVER_NUM_FAILED_RESPONSE_WRITES("serverNumFailedResponseWrites", Unit.REQUEST, "Number of failed response writes"),
- SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY("serverTotalSuccessfulResponseLatency", Unit.MILLISECOND, "Total duration for execution of successful responses"),
+ SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY("serverTotalSuccessfulResponseLatency", Unit.MILLISECOND, "Total duration for execution of successful responses"),
SERVER_TOTAL_FAILED_RESPONSE_LATENCY("serverTotalFailedResponseLatency", Unit.MILLISECOND, "Total duration for execution of failed responses"),
SERVER_TIME_TO_FIRST_BYTE("serverTimeToFirstByte", Unit.MILLISECOND, "Time from request has been received by the server until the first byte is returned to the client"),
diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java
index d514b9e9839..3a438efde06 100644
--- a/metrics/src/main/java/ai/vespa/metrics/Unit.java
+++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java
@@ -10,6 +10,7 @@ public enum Unit {
BYTE(BaseUnit.BYTE, "A collection of 8 bits"),
BYTE_PER_SECOND(BaseUnit.BYTE, BaseUnit.SECOND, "A unit of storage capable of holding 8 bits"),
CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"),
+ DEPLOYMENT(BaseUnit.DEPLOYMENT, "A deployment on hosted Vespa"),
DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"),
DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"),
DOLLAR_PER_HOUR(BaseUnit.DOLLAR, BaseUnit.HOUR, "Total current cost of the cluster in $/hr"),
@@ -36,10 +37,12 @@ public enum Unit {
REQUEST(BaseUnit.REQUEST, "A request sent from a client to a server"),
RESPONSE(BaseUnit.RESPONSE, "A response from a server to a client, typically as a response to a request"),
RESTART(BaseUnit.RESTART, "A service or node restarts"),
+ ROTATION(BaseUnit.ROTATION, "Routing rotation"),
SCORE(BaseUnit.SCORE, "Relevance score for a document"),
SECOND(BaseUnit.SECOND, "Time span of 1 second"),
SESSION(BaseUnit.SESSION, "A set of operations taking place during one connection or as part of a higher level operation"),
TASK(BaseUnit.TASK, "Piece of work executed by a server, e.g. to perform back-ground data maintenance"),
+ TENANT(BaseUnit.TENANT, "Tenant that owns zero or more applications in a managed Vespa system"),
THREAD(BaseUnit.THREAD, "Computer thread for executing e.g. tasks, operations or queries"),
VCPU(BaseUnit.VCPU,"Virtual CPU"),
@@ -84,6 +87,7 @@ public enum Unit {
BUCKET("bucket"),
BYTE("byte"),
CONNECTION("connection"),
+ DEPLOYMENT("deployment"),
DOCUMENT("document"),
DOCUMENTID("documentid"),
DOLLAR("dollar"),
@@ -108,10 +112,12 @@ public enum Unit {
REQUEST("request"),
RESPONSE("response"),
RESTART("restart"),
+ ROTATION("routing rotation"),
SCORE("score"),
SECOND("second", "s"),
SESSION("session"),
TASK("task"),
+ TENANT("tenant"),
THREAD("thread"),
VCPU("vcpu"),
VERSION("version"),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 585a7f341b5..5673b2d74ea 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
@@ -45,10 +46,10 @@ public class NodeFailer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(NodeFailer.class.getName());
/** Metric for number of hosts that we want to fail, but cannot due to throttling */
- static final String throttledHostFailuresMetric = "throttledHostFailures";
+ static final String throttledHostFailuresMetric = ConfigServerMetrics.THROTTLED_HOST_FAILURES.baseName();
/** Metric for number of nodes that we want to fail, but cannot due to throttling */
- static final String throttledNodeFailuresMetric = "throttledNodeFailures";
+ static final String throttledNodeFailuresMetric = ConfigServerMetrics.THROTTLED_NODE_FAILURES.baseName();
/** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */
static final String throttlingActiveMetric = "nodeFailThrottling";
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
index dcdcbf09175..da05656fcee 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Deployer;
@@ -75,7 +76,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
CapacityChecker capacityChecker = new CapacityChecker(allNodes);
List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
- metric.set("overcommittedHosts", overcommittedHosts.size(), null);
+ metric.set(ConfigServerMetrics.OVERCOMMITTED_HOSTS.baseName(), overcommittedHosts.size(), null);
retireOvercommitedHosts(allNodes, overcommittedHosts);
boolean success = true;
@@ -93,7 +94,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
success = false;
}
}
- metric.set("spareHostCapacity", spareHostCapacity, null);
+ metric.set(ConfigServerMetrics.SPARE_HOST_CAPACITY.baseName(), spareHostCapacity, null);
}
return success ? 1.0 : 0.0;
}