author    gjoranv <gv@yahooinc.com>  2023-05-08 12:44:34 +0200
committer gjoranv <gv@yahooinc.com>  2023-05-08 12:44:34 +0200
commit    be2ad5ff2b54bb8fdf145682005052fe26e1547d (patch)
tree      e86933d7cc037d11fdfedc3fb5a325f4d99357f1 /metrics
parent    0c97250ee7c1bab4d3af448c8fea5e5ccdb9638a (diff)
Move metrics definitions to metrics:ai.vespa.metrics
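
Every metric set added below follows the same pattern: an enum implementing the
shared VespaMetrics interface, holding a metric name, a Unit, and a
human-readable description. As an illustrative sketch only — VespaMetrics.java
is not shown in this section, so only the three accessors that every enum below
implements are assumed — a consumer can treat any of the sets uniformly:

    // Hypothetical consumer, not part of this commit; relies only on the
    // baseName()/unit()/description() accessors defined by each enum below.
    static void printMetrics(VespaMetrics[] metrics) {
        for (VespaMetrics metric : metrics) {
            System.out.println(metric.baseName() + " [" + metric.unit() + "]: "
                    + metric.description());
        }
    }
    // Usage: printMetrics(ClusterControllerMetrics.values());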
Diffstat (limited to 'metrics')
-rw-r--r--  metrics/pom.xml                                                        |   6
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java  |  53
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java       |  60
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java          | 228
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java        | 257
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java    |  61
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java               |  33
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java          |  35
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java       |  34
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java         | 299
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java           |  36
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java            |  37
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java            | 253
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/Suffix.java                    |  24
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/Unit.java                      | 129
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java              |  56
-rw-r--r--  metrics/src/main/java/ai/vespa/metrics/package-info.java              |   5
17 files changed, 1606 insertions, 0 deletions
diff --git a/metrics/pom.xml b/metrics/pom.xml
index f44a2569ace..e8303e5a01f 100644
--- a/metrics/pom.xml
+++ b/metrics/pom.xml
@@ -15,6 +15,12 @@
<dependencies>
<dependency>
<groupId>com.yahoo.vespa</groupId>
+ <artifactId>annotations</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
<artifactId>config-lib</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
diff --git a/metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java
new file mode 100644
index 00000000000..545de543663
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java
@@ -0,0 +1,53 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngve
+ */
+public enum ClusterControllerMetrics implements VespaMetrics {
+
+ DOWN_COUNT("cluster-controller.down.count", Unit.NODE, "Number of content nodes down"),
+ INITIALIZING_COUNT("cluster-controller.initializing.count", Unit.NODE, "Number of content nodes initializing"),
+ MAINTENANCE_COUNT("cluster-controller.maintenance.count", Unit.NODE, "Number of content nodes in maintenance"),
+ RETIRED_COUNT("cluster-controller.retired.count", Unit.NODE, "Number of content nodes that are retired"),
+ STOPPING_COUNT("cluster-controller.stopping.count", Unit.NODE, "Number of content nodes currently stopping"),
+ UP_COUNT("cluster-controller.up.count", Unit.NODE, "Number of content nodes up"),
+ CLUSTER_STATE_CHANGE_COUNT("cluster-controller.cluster-state-change.count", Unit.NODE, "Number of nodes changing state"),
+ BUSY_TICK_TIME_MS("cluster-controller.busy-tick-time-ms", Unit.MILLISECOND, "Time busy"),
+ IDLE_TICK_TIME_MS("cluster-controller.idle-tick-time-ms", Unit.MILLISECOND, "Time idle"),
+ WORK_MS("cluster-controller.work-ms", Unit.MILLISECOND, "Time used for actual work"),
+ IS_MASTER("cluster-controller.is-master", Unit.BINARY, "1 if this cluster controller is currently the master, or 0 if not"),
+ REMOTE_TASK_QUEUE_SIZE("cluster-controller.remote-task-queue.size", Unit.OPERATION, "Number of remote tasks queued"),
+ // TODO(hakonhall): Update this name once persistent "count" metrics have been implemented.
+ // DO NOT RELY ON THIS METRIC YET.
+ NODE_EVENT_COUNT("cluster-controller.node-event.count", Unit.OPERATION, "Number of node events"),
+ RESOURCE_USAGE_NODES_ABOVE_LIMIT("cluster-controller.resource_usage.nodes_above_limit", Unit.NODE, "The number of content nodes above resource limit, blocking feed"),
+ RESOURCE_USAGE_MAX_MEMORY_UTILIZATION("cluster-controller.resource_usage.max_memory_utilization", Unit.FRACTION, "Current memory utilisation, per content node"),
+ RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"),
+ RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Memory limit as a fraction of available memory"),
+ RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"),
+ REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress");
+
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ ClusterControllerMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
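
Reported metric names are typically composed of a base name plus a statistical
suffix such as "max" or "count" (see e.g. the deprecation notes in
ContainerMetrics below). A minimal sketch of that composition, assuming plain
string concatenation — the Suffix class added by this commit is not shown in
this section, so its API is not used:

    // Hypothetical helper, not part of this commit.
    static String withSuffix(VespaMetrics metric, String suffix) {
        return metric.baseName() + "." + suffix;
    }
    // e.g. withSuffix(ClusterControllerMetrics.WORK_MS, "max")
    //      -> "cluster-controller.work-ms.max"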
diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
new file mode 100644
index 00000000000..dba75c08aae
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java
@@ -0,0 +1,60 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+public enum ConfigServerMetrics implements VespaMetrics {
+
+ REQUESTS("configserver.requests", Unit.REQUEST, "Number of requests processed"),
+ FAILED_REQUESTS("configserver.failedRequests", Unit.REQUEST, "Number of requests that failed"),
+ LATENCY("configserver.latency", Unit.MILLISECOND, "Time to complete requests"),
+ CACHE_CONFIG_ELEMS("configserver.cacheConfigElems", Unit.ITEM, "Number of config elements in the cache"),
+ CACHE_CHECKSUM_ELEMS("configserver.cacheChecksumElems", Unit.ITEM, "Number of checksum elements in the cache"),
+ HOSTS("configserver.hosts", Unit.NODE, "The number of nodes being served configuration from the config server cluster"),
+ TENANTS("configserver.tenants", Unit.INSTANCE, "The number of tenants being served configuration from the config server cluster"),
+ APPLICATIONS("configserver.applications", Unit.INSTANCE, "The number of applications being served configuration from the config server cluster"),
+ DELAYED_RESPONSES("configserver.delayedResponses", Unit.RESPONSE, "Number of delayed responses"),
+ SESSION_CHANGE_ERRORS("configserver.sessionChangeErrors", Unit.SESSION, "Number of session change errors"),
+ UNKNOWN_HOST_REQUEST("configserver.unknownHostRequests", Unit.REQUEST, "Config requests from unknown hosts"),
+ NEW_SESSIONS("configserver.newSessions", Unit.SESSION, "New config sessions"),
+ PREPARED_SESSIONS("configserver.preparedSessions", Unit.SESSION, "Prepared config sessions"),
+ ACTIVE_SESSIONS("configserver.activeSessions", Unit.SESSION, "Active config sessions"),
+ INACTIVE_SESSIONS("configserver.inactiveSessions", Unit.SESSION, "Inactive config sessions"),
+ ADDED_SESSIONS("configserver.addedSessions", Unit.SESSION, "Added config sessions"),
+ REMOVED_SESSIONS("configserver.removedSessions", Unit.SESSION, "Removed config sessions"),
+ RPC_SERVER_WORK_QUEUE_SIZE("configserver.rpcServerWorkQueueSize", Unit.ITEM, "Number of elements in the RPC server work queue"),
+
+ // ZooKeeper related metrics
+ ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"),
+ ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"),
+ ZK_CONNECTED("configserver.zkConnected", Unit.NODE, "Number of ZooKeeper nodes connected"),
+ ZK_SUSPENDED("configserver.zkSuspended", Unit.NODE, "Number of ZooKeeper nodes suspended"),
+ ZK_Z_NODES("configserver.zkZNodes", Unit.NODE, "Number of ZooKeeper nodes present"),
+ ZK_AVG_LATENCY("configserver.zkAvgLatency", Unit.MILLISECOND, "Average latency for ZooKeeper requests"), // TODO: Confirm metric name
+ ZK_MAX_LATENCY("configserver.zkMaxLatency", Unit.MILLISECOND, "Max latency for ZooKeeper requests"),
+ ZK_CONNECTIONS("configserver.zkConnections", Unit.CONNECTION, "Number of ZooKeeper connections"),
+ ZK_OUTSTANDING_REQUESTS("configserver.zkOutstandingRequests", Unit.REQUEST, "Number of ZooKeeper requests in flight");
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ ConfigServerMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
diff --git a/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java
new file mode 100644
index 00000000000..84de1878b3f
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java
@@ -0,0 +1,228 @@
+package ai.vespa.metrics;
+
+/**
+ * @author gjoranv
+ */
+public enum ContainerMetrics implements VespaMetrics {
+
+ HTTP_STATUS_1XX("http.status.1xx", Unit.RESPONSE, "Number of responses with a 1xx status"),
+ HTTP_STATUS_2XX("http.status.2xx", Unit.RESPONSE, "Number of responses with a 2xx status"),
+ HTTP_STATUS_3XX("http.status.3xx", Unit.RESPONSE, "Number of responses with a 3xx status"),
+ HTTP_STATUS_4XX("http.status.4xx", Unit.RESPONSE, "Number of responses with a 4xx status"),
+ HTTP_STATUS_5XX("http.status.5xx", Unit.RESPONSE, "Number of responses with a 5xx status"),
+
+ APPLICATION_GENERATION("application_generation", Unit.VERSION, "The currently live application config generation (aka session id)"),
+ IN_SERVICE("in_service", Unit.BINARY, "This will have the value 1 if the node is in service, 0 if not."),
+
+ JDISC_GC_COUNT("jdisc.gc.count", Unit.OPERATION, "Number of JVM garbage collections done"),
+ JDISC_GC_MS("jdisc.gc.ms", Unit.MILLISECOND, "Time spent in JVM garbage collection"),
+ JDISC_JVM("jdisc.jvm", Unit.VERSION, "JVM runtime version"),
+ JDISC_MEMORY_MAPPINGS("jdisc.memory_mappings", Unit.OPERATION, "JDISC Memory mappings"),
+ JDISC_OPEN_FILE_DESCRIPTORS("jdisc.open_file_descriptors", Unit.ITEM, "JDISC Open file descriptors"),
+
+ JDISC_THREAD_POOL_UNHANDLED_EXCEPTIONS("jdisc.thread_pool.unhandled_exceptions", Unit.THREAD, "Number of exceptions thrown by tasks"),
+ JDISC_THREAD_POOL_WORK_QUEUE_CAPACITY("jdisc.thread_pool.work_queue.capacity", Unit.THREAD, "Capacity of the task queue"),
+ JDISC_THREAD_POOL_WORK_QUEUE_SIZE("jdisc.thread_pool.work_queue.size", Unit.THREAD, "Size of the task queue"),
+ JDISC_THREAD_POOL_REJECTED_TASKS("jdisc.thread_pool.rejected_tasks", Unit.THREAD, "Number of tasks rejected by the thread pool"),
+ JDISC_THREAD_POOL_SIZE("jdisc.thread_pool.size", Unit.THREAD, "Size of the thread pool"),
+ JDISC_THREAD_POOL_MAX_ALLOWED_SIZE("jdisc.thread_pool.max_allowed_size", Unit.THREAD, "The maximum allowed number of threads in the pool"),
+ JDISC_THREAD_POOL_ACTIVE_THREADS("jdisc.thread_pool.active_threads", Unit.THREAD, "Number of threads that are active"),
+
+ JDISC_DEACTIVATED_CONTAINERS("jdisc.deactivated_containers.total", Unit.ITEM, "JDISC Deactivated container instances"),
+ JDISC_DEACTIVATED_CONTAINERS_WITH_RETAINED_REFS("jdisc.deactivated_containers.with_retained_refs.last", Unit.ITEM, "JDISC Deactivated container nodes with retained refs"),
+ JDISC_APPLICATION_FAILED_COMPONENT_GRAPHS("jdisc.application.failed_component_graphs", Unit.ITEM, "JDISC Application failed component graphs"),
+ JDISC_APPLICATION_COMPONENT_GRAPH_CREATION_TIME_MILLIS("jdisc.application.component_graph.creation_time_millis", Unit.MILLISECOND, "JDISC Application component graph creation time"),
+ JDISC_APPLICATION_COMPONENT_GRAPH_RECONFIGURATIONS("jdisc.application.component_graph.reconfigurations", Unit.ITEM, "JDISC Application component graph reconfigurations"),
+
+ JDISC_SINGLETON_IS_ACTIVE("jdisc.singleton.is_active", Unit.ITEM, "JDISC Singleton is active"),
+ JDISC_SINGLETON_ACTIVATION_COUNT("jdisc.singleton.activation.count", Unit.OPERATION, "JDISC Singleton activations"),
+ JDISC_SINGLETON_ACTIVATION_FAILURE_COUNT("jdisc.singleton.activation.failure.count", Unit.OPERATION, "JDISC Singleton activation failures"),
+ JDISC_SINGLETON_ACTIVATION_MILLIS("jdisc.singleton.activation.millis", Unit.MILLISECOND, "JDISC Singleton activation time"),
+ JDISC_SINGLETON_DEACTIVATION_COUNT("jdisc.singleton.deactivation.count", Unit.OPERATION, "JDISC Singleton deactivations"),
+ JDISC_SINGLETON_DEACTIVATION_FAILURE_COUNT("jdisc.singleton.deactivation.failure.count", Unit.OPERATION, "JDISC Singleton deactivation failures"),
+ JDISC_SINGLETON_DEACTIVATION_MILLIS("jdisc.singleton.deactivation.millis", Unit.MILLISECOND, "JDISC Singleton deactivation time"),
+
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_MISSING_CLIENT_CERT("jdisc.http.ssl.handshake.failure.missing_client_cert", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to missing client certificate"),
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_EXPIRED_CLIENT_CERT("jdisc.http.ssl.handshake.failure.expired_client_cert", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to expired client certificate"),
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INVALID_CLIENT_CERT("jdisc.http.ssl.handshake.failure.invalid_client_cert", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to invalid client certificate"),
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_PROTOCOLS("jdisc.http.ssl.handshake.failure.incompatible_protocols", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to incompatible protocols"),
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_CHIFERS("jdisc.http.ssl.handshake.failure.incompatible_chifers", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to incompatible ciphers"),
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_CONNECTION_CLOSED("jdisc.http.ssl.handshake.failure.connection_closed", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to connection closed"),
+ JDISC_HTTP_SSL_HANDSHAKE_FAILURE_UNKNOWN("jdisc.http.ssl.handshake.failure.unknown", Unit.OPERATION, "JDISC HTTP SSL Handshake failures for unknown reason"),
+
+ JDISC_HTTP_REQUEST_PREMATURELY_CLOSED("jdisc.http.request.prematurely_closed", Unit.REQUEST, "HTTP requests prematurely closed"),
+ JDISC_HTTP_REQUEST_REQUESTS_PER_CONNECTION("jdisc.http.request.requests_per_connection", Unit.REQUEST, "HTTP requests per connection"),
+ JDISC_HTTP_REQUEST_URI_LENGTH("jdisc.http.request.uri_length", Unit.BYTE, "HTTP URI length"),
+ JDISC_HTTP_REQUEST_CONTENT_SIZE("jdisc.http.request.content_size", Unit.BYTE, "HTTP request content size"),
+ JDISC_HTTP_REQUESTS("jdisc.http.requests", Unit.REQUEST, "HTTP requests"),
+ JDISC_HTTP_REQUESTS_STATUS("jdisc.http.requests.status", Unit.REQUEST, "Number of requests to the built-in status handler"),
+
+ JDISC_HTTP_FILTER_RULE_BLOCKED_REQUESTS("jdisc.http.filter.rule.blocked_requests", Unit.REQUEST, "Number of requests blocked by filter"),
+ JDISC_HTTP_FILTER_RULE_ALLOWED_REQUESTS("jdisc.http.filter.rule.allowed_requests", Unit.REQUEST, "Number of requests allowed by filter"),
+ JDISC_HTTP_FILTERING_REQUEST_HANDLED("jdisc.http.filtering.request.handled", Unit.REQUEST, "Number of filtering requests handled"),
+ JDISC_HTTP_FILTERING_REQUEST_UNHANDLED("jdisc.http.filtering.request.unhandled", Unit.REQUEST, "Number of filtering requests unhandled"),
+ JDISC_HTTP_FILTERING_RESPONSE_HANDLED("jdisc.http.filtering.response.handled", Unit.REQUEST, "Number of filtering responses handled"),
+ JDISC_HTTP_FILTERING_RESPONSE_UNHANDLED("jdisc.http.filtering.response.unhandled", Unit.REQUEST, "Number of filtering responses unhandled"),
+ JDISC_HTTP_HANDLER_UNHANDLED_EXCEPTIONS("jdisc.http.handler.unhandled_exceptions", Unit.REQUEST, "Number of unhandled exceptions in handler"),
+
+ JDISC_TLS_CAPABILITY_CHECKS_SUCCEEDED("jdisc.tls.capability_checks.succeeded", Unit.OPERATION, "Number of TLS capability checks succeeded"),
+ JDISC_TLS_CAPABILITY_CHECKS_FAILED("jdisc.tls.capability_checks.failed", Unit.OPERATION, "Number of TLS capability checks failed"),
+
+ JETTY_THREADPOOL_MAX_THREADS("jdisc.http.jetty.threadpool.thread.max", Unit.THREAD, "Configured maximum number of threads"),
+ JETTY_THREADPOOL_MIN_THREADS("jdisc.http.jetty.threadpool.thread.min", Unit.THREAD, "Configured minimum number of threads"),
+ JETTY_THREADPOOL_RESERVED_THREADS("jdisc.http.jetty.threadpool.thread.reserved", Unit.THREAD, "Configured number of reserved threads or -1 for heuristic"),
+ JETTY_THREADPOOL_BUSY_THREADS("jdisc.http.jetty.threadpool.thread.busy", Unit.THREAD, "Number of threads executing internal and transient jobs"),
+ JETTY_THREADPOOL_IDLE_THREADS("jdisc.http.jetty.threadpool.thread.idle", Unit.THREAD, "Number of idle threads"),
+ JETTY_THREADPOOL_TOTAL_THREADS("jdisc.http.jetty.threadpool.thread.total", Unit.THREAD, "Current number of threads"),
+ JETTY_THREADPOOL_QUEUE_SIZE("jdisc.http.jetty.threadpool.queue.size", Unit.THREAD, "Current size of the job queue"),
+
+ SERVER_NUM_OPEN_CONNECTIONS("serverNumOpenConnections", Unit.CONNECTION, "The number of currently open connections"),
+ SERVER_NUM_CONNECTIONS("serverNumConnections", Unit.CONNECTION, "The total number of connections opened"),
+
+ SERVER_BYTES_RECEIVED("serverBytesReceived", Unit.BYTE, "The number of bytes received by the server"),
+ SERVER_BYTES_SENT("serverBytesSent", Unit.BYTE, "The number of bytes sent from the server"),
+
+ HANDLED_REQUESTS("handled.requests", Unit.OPERATION, "The number of requests handled per metrics snapshot"),
+ HANDLED_LATENCY("handled.latency", Unit.MILLISECOND, "The time used for requests during this metrics snapshot"),
+
+ HTTPAPI_LATENCY("httpapi_latency", Unit.MILLISECOND, "Duration for requests to the HTTP document APIs"),
+ HTTPAPI_PENDING("httpapi_pending", Unit.OPERATION, "Document operations pending execution"),
+ HTTPAPI_NUM_OPERATIONS("httpapi_num_operations", Unit.OPERATION, "Total number of document operations performed"),
+ HTTPAPI_NUM_UPDATES("httpapi_num_updates", Unit.OPERATION, "Document update operations performed"),
+ HTTPAPI_NUM_REMOVES("httpapi_num_removes", Unit.OPERATION, "Document remove operations performed"),
+ HTTPAPI_NUM_PUTS("httpapi_num_puts", Unit.OPERATION, "Document put operations performed"),
+ HTTPAPI_SUCCEEDED("httpapi_succeeded", Unit.OPERATION, "Document operations that succeeded"),
+ HTTPAPI_FAILED("httpapi_failed", Unit.OPERATION, "Document operations that failed"),
+ HTTPAPI_PARSE_ERROR("httpapi_parse_error", Unit.OPERATION, "Document operations that failed due to document parse errors"),
+ HTTPAPI_CONDITION_NOT_MET("httpapi_condition_not_met", Unit.OPERATION, "Document operations not applied due to condition not met"),
+ HTTPAPI_NOT_FOUND("httpapi_not_found", Unit.OPERATION, "Document operations not applied due to document not found"),
+ HTTPAPI_FAILED_UNKNOWN("httpapi_failed_unknown", Unit.OPERATION, "Document operations failed by unknown cause"),
+ HTTPAPI_FAILED_TIMEOUT("httpapi_failed_timeout", Unit.OPERATION, "Document operations failed by timeout"),
+ HTTPAPI_FAILED_INSUFFICIENT_STORAGE("httpapi_failed_insufficient_storage", Unit.OPERATION, "Document operations failed by insufficient storage"),
+
+ MEM_HEAP_TOTAL("mem.heap.total", Unit.BYTE, "Total available heap memory"),
+ MEM_HEAP_FREE("mem.heap.free", Unit.BYTE, "Free heap memory"),
+ MEM_HEAP_USED("mem.heap.used", Unit.BYTE, "Currently used heap memory"),
+ MEM_DIRECT_TOTAL("mem.direct.total", Unit.BYTE, "Total available direct memory"),
+ MEM_DIRECT_FREE("mem.direct.free", Unit.BYTE, "Currently free direct memory"),
+ MEM_DIRECT_USED("mem.direct.used", Unit.BYTE, "Direct memory currently used"),
+ MEM_DIRECT_COUNT("mem.direct.count", Unit.BYTE, "Number of direct memory allocations"),
+ MEM_NATIVE_TOTAL("mem.native.total", Unit.BYTE, "Total available native memory"),
+ MEM_NATIVE_FREE("mem.native.free", Unit.BYTE, "Currently free native memory"),
+ MEM_NATIVE_USED("mem.native.used", Unit.BYTE, "Native memory currently used"),
+
+ ATHENZ_TENANT_CERT_EXPIRY_SECONDS("athenz-tenant-cert.expiry.seconds", Unit.SECOND, "Time remaining until Athenz tenant certificate expires"),
+ CONTAINER_IAM_ROLE_EXPIRY_SECONDS("container-iam-role.expiry.seconds", Unit.SECOND, "Time remaining until IAM role expires"),
+
+
+ // SearchChain metrics
+ PEAK_QPS("peak_qps", Unit.QUERY_PER_SECOND, "The highest number of queries per second, measured over one-second intervals, within this metrics snapshot"),
+ SEARCH_CONNECTIONS("search_connections", Unit.CONNECTION, "Number of search connections"),
+ FEED_OPERATIONS("feed.operations", Unit.OPERATION, "Number of document feed operations"),
+ FEED_LATENCY("feed.latency", Unit.MILLISECOND, "Feed latency"),
+ FEED_HTTP_REQUESTS("feed.http-requests", Unit.OPERATION, "Feed HTTP requests"),
+ QUERIES("queries", Unit.OPERATION, "Query volume"),
+ QUERY_CONTAINER_LATENCY("query_container_latency", Unit.MILLISECOND, "The query execution time consumed in the container"),
+ QUERY_LATENCY("query_latency", Unit.MILLISECOND, "The overall query latency as seen by the container"),
+ QUERY_TIMEOUT("query_timeout", Unit.MILLISECOND, "The amount of time allowed for query execution, as specified by the client"),
+ FAILED_QUERIES("failed_queries", Unit.OPERATION, "The number of failed queries"),
+ DEGRADED_QUERIES("degraded_queries", Unit.OPERATION, "The number of degraded queries, e.g. due to some content nodes not responding in time"),
+ HITS_PER_QUERY("hits_per_query", Unit.HIT_PER_QUERY, "The number of hits returned"),
+ QUERY_HIT_OFFSET("query_hit_offset", Unit.HIT, "The offset for hits returned"),
+ DOCUMENTS_COVERED("documents_covered", Unit.DOCUMENT, "The combined number of documents considered during query evaluation"),
+ DOCUMENTS_TOTAL("documents_total", Unit.DOCUMENT, "The number of documents to be evaluated if all requests had been fully executed"),
+ DOCUMENTS_TARGET_TOTAL("documents_target_total", Unit.DOCUMENT, "The target number of total documents to be evaluated when all data is in sync"),
+ JDISC_RENDER_LATENCY("jdisc.render.latency", Unit.NANOSECOND, "The time used by the container to render responses"),
+ QUERY_ITEM_COUNT("query_item_count", Unit.ITEM, "The number of query items (terms, phrases, etc)"),
+ DOCPROC_PROC_TIME("docproc.proctime", Unit.MILLISECOND, "Time spent processing documents"),
+ DOCPROC_DOCUMENTS("docproc.documents", Unit.DOCUMENT, "Number of processed documents"),
+
+ TOTAL_HITS_PER_QUERY("totalhits_per_query", Unit.HIT_PER_QUERY, "The total number of documents found to match queries"),
+ EMPTY_RESULTS("empty_results", Unit.OPERATION, "Number of queries matching no documents"),
+ REQUESTS_OVER_QUOTA("requestsOverQuota", Unit.OPERATION, "The number of requests rejected due to exceeding quota"),
+
+ RELEVANCE_AT_1("relevance.at_1", Unit.SCORE, "The relevance of hit number 1"),
+ RELEVANCE_AT_3("relevance.at_3", Unit.SCORE, "The relevance of hit number 3"),
+ RELEVANCE_AT_10("relevance.at_10", Unit.SCORE, "The relevance of hit number 10"),
+
+ // Errors from search container
+ ERROR_TIMEOUT("error.timeout", Unit.OPERATION, "Requests that timed out"),
+ ERROR_BACKENDS_OOS("error.backends_oos", Unit.OPERATION, "Requests that failed due to no available backend nodes"),
+ ERROR_PLUGIN_FAILURE("error.plugin_failure", Unit.OPERATION, "Requests that failed due to plugin failure"),
+ ERROR_BACKEND_COMMUNICATION_ERROR("error.backend_communication_error", Unit.OPERATION, "Requests that failed due to backend communication error"),
+ ERROR_EMPTY_DOCUMENT_SUMMARIES("error.empty_document_summaries", Unit.OPERATION, "Requests that failed due to missing document summaries"),
+ ERROR_ILLEGAL_QUERY("error.illegal_query", Unit.OPERATION, "Requests that failed due to illegal queries"),
+ ERROR_INVALID_QUERY_PARAMETER("error.invalid_query_parameter", Unit.OPERATION, "Requests that failed due to invalid query parameters"),
+ ERROR_INTERNAL_SERVER_ERROR("error.internal_server_error", Unit.OPERATION, "Requests that failed due to internal server error"),
+ ERROR_MISCONFIGURED_SERVER("error.misconfigured_server", Unit.OPERATION, "Requests that failed due to misconfigured server"),
+ ERROR_INVALID_QUERY_TRANSFORMATION("error.invalid_query_transformation", Unit.OPERATION, "Requests that failed due to invalid query transformation"),
+ ERROR_RESULTS_WITH_ERRORS("error.results_with_errors", Unit.OPERATION, "The number of queries with error payload"),
+ ERROR_UNSPECIFIED("error.unspecified", Unit.OPERATION, "Requests that failed for an unspecified reason"),
+ ERROR_UNHANDLED_EXCEPTION("error.unhandled_exception", Unit.OPERATION, "Requests that failed due to an unhandled exception"),
+
+ // Deprecated metrics. TODO: Remove in Vespa 9.
+ SERVER_REJECTED_REQUESTS("serverRejectedRequests", Unit.OPERATION, "Deprecated. Use jdisc.thread_pool.rejected_tasks instead."),
+ SERVER_THREAD_POOL_SIZE("serverThreadPoolSize", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.size instead."),
+ SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.active_threads instead."),
+
+ // Java (JRT) TLS metrics
+ JRT_TRANSPORT_TLS_CERTIFICATE_VERIFICATION_FAILURES("jrt.transport.tls-certificate-verification-failures", Unit.FAILURE, "TLS certificate verification failures"),
+ JRT_TRANSPORT_PEER_AUTHORIZATION_FAILURES("jrt.transport.peer-authorization-failures", Unit.FAILURE, "TLS peer authorization failures"),
+ JRT_TRANSPORT_SERVER_TLS_CONNECTIONS_ESTABLISHED("jrt.transport.server.tls-connections-established", Unit.CONNECTION, "TLS server connections established"),
+ JRT_TRANSPORT_CLIENT_TLS_CONNECTIONS_ESTABLISHED("jrt.transport.client.tls-connections-established", Unit.CONNECTION, "TLS client connections established"),
+ JRT_TRANSPORT_SERVER_UNENCRYPTED_CONNECTIONS_ESTABLISHED("jrt.transport.server.unencrypted-connections-established", Unit.CONNECTION, "Unencrypted server connections established"),
+ JRT_TRANSPORT_CLIENT_UNENCRYPTED_CONNECTIONS_ESTABLISHED("jrt.transport.client.unencrypted-connections-established", Unit.CONNECTION, "Unencrypted client connections established"),
+
+ MAX_QUERY_LATENCY("max_query_latency", Unit.MILLISECOND, "Deprecated. Use query_latency.max instead"), // TODO: Remove in Vespa 9
+ MEAN_QUERY_LATENCY("mean_query_latency", Unit.MILLISECOND, "Deprecated. Use the expression (query_latency.sum / query_latency.count) instead"), // TODO: Remove in Vespa 9
+
+
+ // Metrics defined in com/yahoo/jdisc/http/filter/security/athenz/AthenzAuthorizationFilter.java
+ JDISC_HTTP_FILTER_ATHENZ_ACCEPTED_REQUESTS("jdisc.http.filter.athenz.accepted_requests", Unit.REQUEST, "Number of requests accepted by the AthenzAuthorization filter"),
+ JDISC_HTTP_FILTER_ATHENZ_REJECTED_REQUESTS("jdisc.http.filter.athenz.rejected_requests", Unit.REQUEST, "Number of requests rejected by the AthenzAuthorization filter"),
+
+
+ // Metrics defined in com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java
+ SERVER_CONNECTIONS_OPEN_MAX("serverConnectionsOpenMax", Unit.CONNECTION, "Maximum number of open connections"),
+ SERVER_CONNECTION_DURATION_MAX("serverConnectionDurationMax", Unit.MILLISECOND, "Longest duration a connection is kept open"),
+
+ SERVER_CONNECTION_DURATION_MEAN("serverConnectionDurationMean", Unit.MILLISECOND, "Average duration a connection is kept open"),
+ SERVER_CONNECTION_DURATION_STD_DEV("serverConnectionDurationStdDev", Unit.MILLISECOND, "Standard deviation of open connection duration"),
+ SERVER_NUM_REQUESTS("serverNumRequests", Unit.REQUEST, "Number of requests"),
+ SERVER_NUM_SUCCESSFUL_RESPONSES("serverNumSuccessfulResponses", Unit.REQUEST, "Number of successful responses"),
+ SERVER_NUM_FAILED_RESPONSES("serverNumFailedResponses", Unit.REQUEST, "Number of failed responses"),
+
+ SERVER_NUM_SUCCESSFUL_RESPONSE_WRITES("serverNumSuccessfulResponseWrites", Unit.REQUEST, "Number of successful response writes"),
+ SERVER_NUM_FAILED_RESPONSE_WRITES("serverNumFailedResponseWrites", Unit.REQUEST, "Number of failed response writes"),
+
+ SERVER_TOTAL_SUCCESSFUL_RESPONSE_LATENCY("serverTotalSuccessfulResponseLatency", Unit.MILLISECOND, "Total duration for execution of successful responses"),
+ SERVER_TOTAL_FAILED_RESPONSE_LATENCY("serverTotalFailedResponseLatency", Unit.MILLISECOND, "Total duration for execution of failed responses"),
+ SERVER_TIME_TO_FIRST_BYTE("serverTimeToFirstByte", Unit.MILLISECOND, "Time from request has been received by the server until the first byte is returned to the client"),
+
+ SERVER_STARTED_MILLIS("serverStartedMillis", Unit.MILLISECOND, "Time since the service was started");
+
+
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ ContainerMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
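
The deprecated ContainerMetrics above name their replacements directly in
their descriptions. A hedged sketch of an alias table derived from those notes
(illustrative only, not part of this commit):

    import java.util.Map;

    // Old metric name -> replacement, taken from the deprecation notes above.
    static final Map<String, String> DEPRECATED_ALIASES = Map.of(
            "serverRejectedRequests", "jdisc.thread_pool.rejected_tasks",
            "serverThreadPoolSize", "jdisc.thread_pool.size",
            "serverActiveThreads", "jdisc.thread_pool.active_threads",
            "max_query_latency", "query_latency.max");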
diff --git a/metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java b/metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java
new file mode 100644
index 00000000000..b2ebbc4d6aa
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java
@@ -0,0 +1,257 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+public enum DistributorMetrics implements VespaMetrics {
+
+ VDS_IDEALSTATE_BUCKETS_RECHECKING("vds.idealstate.buckets_rechecking", Unit.BUCKET, "The number of buckets that we are rechecking for ideal state operations"),
+ VDS_IDEALSTATE_IDEALSTATE_DIFF("vds.idealstate.idealstate_diff", Unit.BUCKET, "A number representing the current difference from the ideal state. This is a number that decreases steadily as the system is getting closer to the ideal state"),
+ VDS_IDEALSTATE_BUCKETS_TOOFEWCOPIES("vds.idealstate.buckets_toofewcopies", Unit.BUCKET, "The number of buckets the distributor controls that have less than the desired redundancy"),
+ VDS_IDEALSTATE_BUCKETS_TOOMANYCOPIES("vds.idealstate.buckets_toomanycopies", Unit.BUCKET, "The number of buckets the distributor controls that have more than the desired redundancy"),
+ VDS_IDEALSTATE_BUCKETS("vds.idealstate.buckets", Unit.BUCKET, "The number of buckets the distributor controls"),
+ VDS_IDEALSTATE_BUCKETS_NOTRUSTED("vds.idealstate.buckets_notrusted", Unit.BUCKET, "The number of buckets that have no trusted copies."),
+ VDS_IDEALSTATE_BUCKET_REPLICAS_MOVING_OUT("vds.idealstate.bucket_replicas_moving_out", Unit.BUCKET, "Bucket replicas that should be moved out, e.g. retirement case or node added to cluster that has higher ideal state priority."),
+ VDS_IDEALSTATE_BUCKET_REPLICAS_COPYING_OUT("vds.idealstate.bucket_replicas_copying_out", Unit.BUCKET, "Bucket replicas that should be copied out, e.g. node is in ideal state but might have to provide data to other nodes in a merge"),
+ VDS_IDEALSTATE_BUCKET_REPLICAS_COPYING_IN("vds.idealstate.bucket_replicas_copying_in", Unit.BUCKET, "Bucket replicas that should be copied in, e.g. node does not have a replica for a bucket that it is in ideal state for"),
+ VDS_IDEALSTATE_BUCKET_REPLICAS_SYNCING("vds.idealstate.bucket_replicas_syncing", Unit.BUCKET, "Bucket replicas that need syncing due to mismatching metadata"),
+ VDS_IDEALSTATE_MAX_OBSERVED_TIME_SINCE_LAST_GC_SEC("vds.idealstate.max_observed_time_since_last_gc_sec", Unit.SECOND, "Maximum time (in seconds) since GC was last successfully run for a bucket. Aggregated max value across all buckets on the distributor."),
+ VDS_IDEALSTATE_DELETE_BUCKET_DONE_OK("vds.idealstate.delete_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"),
+ VDS_IDEALSTATE_DELETE_BUCKET_DONE_FAILED("vds.idealstate.delete_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"),
+ VDS_IDEALSTATE_DELETE_BUCKET_PENDING("vds.idealstate.delete_bucket.pending", Unit.OPERATION, "The number of operations pending"),
+ VDS_IDEALSTATE_DELETE_BUCKET_BLOCKED("vds.idealstate.delete_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"),
+ VDS_IDEALSTATE_DELETE_BUCKET_THROTTLED("vds.idealstate.delete_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"),
+ VDS_IDEALSTATE_MERGE_BUCKET_DONE_OK("vds.idealstate.merge_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"),
+ VDS_IDEALSTATE_MERGE_BUCKET_DONE_FAILED("vds.idealstate.merge_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"),
+ VDS_IDEALSTATE_MERGE_BUCKET_PENDING("vds.idealstate.merge_bucket.pending", Unit.OPERATION, "The number of operations pending"),
+ VDS_IDEALSTATE_MERGE_BUCKET_BLOCKED("vds.idealstate.merge_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"),
+ VDS_IDEALSTATE_MERGE_BUCKET_THROTTLED("vds.idealstate.merge_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"),
+ VDS_IDEALSTATE_MERGE_BUCKET_SOURCE_ONLY_COPY_CHANGED("vds.idealstate.merge_bucket.source_only_copy_changed", Unit.OPERATION, "The number of merge operations where source-only copy changed"),
+ VDS_IDEALSTATE_MERGE_BUCKET_SOURCE_ONLY_COPY_DELETE_BLOCKED("vds.idealstate.merge_bucket.source_only_copy_delete_blocked", Unit.OPERATION, "The number of merge operations where delete of unchanged source-only copies was blocked"),
+ VDS_IDEALSTATE_MERGE_BUCKET_SOURCE_ONLY_COPY_DELETE_FAILED("vds.idealstate.merge_bucket.source_only_copy_delete_failed", Unit.OPERATION, "The number of merge operations where delete of unchanged source-only copies failed"),
+ VDS_IDEALSTATE_SPLIT_BUCKET_DONE_OK("vds.idealstate.split_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"),
+ VDS_IDEALSTATE_SPLIT_BUCKET_DONE_FAILED("vds.idealstate.split_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"),
+ VDS_IDEALSTATE_SPLIT_BUCKET_PENDING("vds.idealstate.split_bucket.pending", Unit.OPERATION, "The number of operations pending"),
+ VDS_IDEALSTATE_SPLIT_BUCKET_BLOCKED("vds.idealstate.split_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"),
+ VDS_IDEALSTATE_SPLIT_BUCKET_THROTTLED("vds.idealstate.split_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"),
+ VDS_IDEALSTATE_JOIN_BUCKET_DONE_OK("vds.idealstate.join_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"),
+ VDS_IDEALSTATE_JOIN_BUCKET_DONE_FAILED("vds.idealstate.join_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"),
+ VDS_IDEALSTATE_JOIN_BUCKET_PENDING("vds.idealstate.join_bucket.pending", Unit.OPERATION, "The number of operations pending"),
+ VDS_IDEALSTATE_JOIN_BUCKET_BLOCKED("vds.idealstate.join_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"),
+ VDS_IDEALSTATE_JOIN_BUCKET_THROTTLED("vds.idealstate.join_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"),
+ VDS_IDEALSTATE_GARBAGE_COLLECTION_DONE_OK("vds.idealstate.garbage_collection.done_ok", Unit.OPERATION, "The number of operations successfully performed"),
+ VDS_IDEALSTATE_GARBAGE_COLLECTION_DONE_FAILED("vds.idealstate.garbage_collection.done_failed", Unit.OPERATION, "The number of operations that failed"),
+ VDS_IDEALSTATE_GARBAGE_COLLECTION_PENDING("vds.idealstate.garbage_collection.pending", Unit.OPERATION, "The number of operations pending"),
+ VDS_IDEALSTATE_GARBAGE_COLLECTION_DOCUMENTS_REMOVED("vds.idealstate.garbage_collection.documents_removed", Unit.DOCUMENT, "Number of documents removed by GC operations"),
+ VDS_IDEALSTATE_GARBAGE_COLLECTION_BLOCKED("vds.idealstate.garbage_collection.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"),
+ VDS_IDEALSTATE_GARBAGE_COLLECTION_THROTTLED("vds.idealstate.garbage_collection.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"),
+
+ VDS_DISTRIBUTOR_PUTS_LATENCY("vds.distributor.puts.latency", Unit.MILLISECOND, "The latency of put operations"),
+ VDS_DISTRIBUTOR_PUTS_OK("vds.distributor.puts.ok", Unit.OPERATION, "The number of successful put operations performed"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_TOTAL("vds.distributor.puts.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_NOTFOUND("vds.distributor.puts.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.puts.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.puts.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_NOTCONNECTED("vds.distributor.puts.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_NOTREADY("vds.distributor.puts.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.puts.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.puts.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_STORAGEFAILURE("vds.distributor.puts.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_TIMEOUT("vds.distributor.puts.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_BUSY("vds.distributor.puts.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_PUTS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.puts.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_REMOVES_LATENCY("vds.distributor.removes.latency", Unit.MILLISECOND, "The latency of remove operations"),
+ VDS_DISTRIBUTOR_REMOVES_OK("vds.distributor.removes.ok", Unit.OPERATION, "The number of successful removes operations performed"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_TOTAL("vds.distributor.removes.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_NOTFOUND("vds.distributor.removes.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_TEST_AND_SET_FAILED("vds.distributor.removes.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.removes.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_BUSY("vds.distributor.removes.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_INCONSISTENT_BUCKET("vds.distributor.removes.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_NOTCONNECTED("vds.distributor.removes.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_NOTREADY("vds.distributor.removes.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.removes.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_STORAGEFAILURE("vds.distributor.removes.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_TIMEOUT("vds.distributor.removes.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_REMOVES_FAILURES_WRONGDISTRIBUTOR("vds.distributor.removes.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_UPDATES_LATENCY("vds.distributor.updates.latency", Unit.MILLISECOND, "The latency of update operations"),
+ VDS_DISTRIBUTOR_UPDATES_OK("vds.distributor.updates.ok", Unit.OPERATION, "The number of successful updates operations performed"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_TOTAL("vds.distributor.updates.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_NOTFOUND("vds.distributor.updates.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_TEST_AND_SET_FAILED("vds.distributor.updates.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.updates.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_UPDATES_DIVERGING_TIMESTAMP_UPDATES("vds.distributor.updates.diverging_timestamp_updates", Unit.OPERATION, "Number of updates that report they were performed against divergent version timestamps on different replicas"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_BUSY("vds.distributor.updates.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_INCONSISTENT_BUCKET("vds.distributor.updates.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_NOTCONNECTED("vds.distributor.updates.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_NOTREADY("vds.distributor.updates.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.updates.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_STORAGEFAILURE("vds.distributor.updates.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_TIMEOUT("vds.distributor.updates.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_UPDATES_FAILURES_WRONGDISTRIBUTOR("vds.distributor.updates.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_UPDATES_FAST_PATH_RESTARTS("vds.distributor.updates.fast_path_restarts", Unit.OPERATION, "Number of safe path (write repair) updates that were restarted as fast path updates because all replicas returned documents with the same timestamp in the initial read phase"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_OK("vds.distributor.removelocations.ok", Unit.OPERATION, "The number of successful removelocations operations performed"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_TOTAL("vds.distributor.removelocations.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_BUSY("vds.distributor.removelocations.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.removelocations.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.removelocations.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_NOTCONNECTED("vds.distributor.removelocations.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_NOTFOUND("vds.distributor.removelocations.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_NOTREADY("vds.distributor.removelocations.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.removelocations.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_STORAGEFAILURE("vds.distributor.removelocations.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.removelocations.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_TIMEOUT("vds.distributor.removelocations.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.removelocations.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_REMOVELOCATIONS_LATENCY("vds.distributor.removelocations.latency", Unit.MILLISECOND, "The average latency of removelocations operations"),
+ VDS_DISTRIBUTOR_GETS_LATENCY("vds.distributor.gets.latency", Unit.MILLISECOND, "The average latency of gets operations"),
+ VDS_DISTRIBUTOR_GETS_OK("vds.distributor.gets.ok", Unit.OPERATION, "The number of successful gets operations performed"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_TOTAL("vds.distributor.gets.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_NOTFOUND("vds.distributor.gets.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_BUSY("vds.distributor.gets.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.gets.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.gets.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_NOTCONNECTED("vds.distributor.gets.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_NOTREADY("vds.distributor.gets.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.gets.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_STORAGEFAILURE("vds.distributor.gets.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.gets.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_TIMEOUT("vds.distributor.gets.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_GETS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.gets.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_VISITOR_LATENCY("vds.distributor.visitor.latency", Unit.MILLISECOND, "The average latency of visitor operations"),
+ VDS_DISTRIBUTOR_VISITOR_OK("vds.distributor.visitor.ok", Unit.OPERATION, "The number of successful visitor operations performed"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_TOTAL("vds.distributor.visitor.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_NOTREADY("vds.distributor.visitor.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_NOTCONNECTED("vds.distributor.visitor.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_WRONGDISTRIBUTOR("vds.distributor.visitor.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.visitor.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_STORAGEFAILURE("vds.distributor.visitor.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_TIMEOUT("vds.distributor.visitor.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_BUSY("vds.distributor.visitor.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_INCONSISTENT_BUCKET("vds.distributor.visitor.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_NOTFOUND("vds.distributor.visitor.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_VISITOR_BYTES_PER_VISITOR("vds.distributor.visitor.bytes_per_visitor", Unit.BYTE, "The number of bytes visited on content nodes as part of a single client visitor command"),
+ VDS_DISTRIBUTOR_VISITOR_DOCS_PER_VISITOR("vds.distributor.visitor.docs_per_visitor", Unit.DOCUMENT, "The number of documents visited on content nodes as part of a single client visitor command"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.visitor.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_VISITOR_FAILURES_TEST_AND_SET_FAILED("vds.distributor.visitor.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+
+ VDS_DISTRIBUTOR_DOCSSTORED("vds.distributor.docsstored", Unit.DOCUMENT, "Number of documents stored in all buckets controlled by this distributor"),
+ VDS_DISTRIBUTOR_BYTESSTORED("vds.distributor.bytesstored", Unit.BYTE, "Number of bytes stored in all buckets controlled by this distributor"),
+
+ METRICMANAGER_PERIODICHOOKLATENCY("metricmanager.periodichooklatency", Unit.MILLISECOND, "Time in ms used to update a single periodic hook"),
+ METRICMANAGER_RESETLATENCY("metricmanager.resetlatency", Unit.MILLISECOND, "Time in ms used to reset all metrics."),
+ METRICMANAGER_SLEEPTIME("metricmanager.sleeptime", Unit.MILLISECOND, "Time in ms worker thread is sleeping"),
+ METRICMANAGER_SNAPSHOTHOOKLATENCY("metricmanager.snapshothooklatency", Unit.MILLISECOND, "Time in ms used to update a single snapshot hook"),
+ METRICMANAGER_SNAPSHOTLATENCY("metricmanager.snapshotlatency", Unit.MILLISECOND, "Time in ms used to take a snapshot"),
+ VDS_DISTRIBUTOR_ACTIVATE_CLUSTER_STATE_PROCESSING_TIME("vds.distributor.activate_cluster_state_processing_time", Unit.MILLISECOND, "Elapsed time where the distributor thread is blocked on merging pending bucket info into its bucket database upon activating a cluster state"),
+ VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_ALLOCATED_BYTES("vds.distributor.bucket_db.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_DEAD_BYTES("vds.distributor.bucket_db.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_ONHOLD_BYTES("vds.distributor.bucket_db.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_USED_BYTES("vds.distributor.bucket_db.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_BUSY("vds.distributor.getbucketlists.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.getbucketlists.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.getbucketlists.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_NOTCONNECTED("vds.distributor.getbucketlists.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_NOTFOUND("vds.distributor.getbucketlists.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_NOTREADY("vds.distributor.getbucketlists.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.getbucketlists.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_STORAGEFAILURE("vds.distributor.getbucketlists.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.getbucketlists.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_TIMEOUT("vds.distributor.getbucketlists.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_TOTAL("vds.distributor.getbucketlists.failures.total", Unit.OPERATION, "Total number of failures"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.getbucketlists.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_LATENCY("vds.distributor.getbucketlists.latency", Unit.MILLISECOND, "The average latency of getbucketlists operations"),
+ VDS_DISTRIBUTOR_GETBUCKETLISTS_OK("vds.distributor.getbucketlists.ok", Unit.OPERATION, "The number of successful getbucketlists operations performed"),
+ VDS_DISTRIBUTOR_RECOVERYMODESCHEDULINGTIME("vds.distributor.recoverymodeschedulingtime", Unit.MILLISECOND, "Time spent scheduling operations in recovery mode after receiving new cluster state"),
+ VDS_DISTRIBUTOR_SET_CLUSTER_STATE_PROCESSING_TIME("vds.distributor.set_cluster_state_processing_time", Unit.MILLISECOND, "Elapsed time where the distributor thread is blocked on processing its bucket database upon receiving a new cluster state"),
+ VDS_DISTRIBUTOR_STATE_TRANSITION_TIME("vds.distributor.state_transition_time", Unit.MILLISECOND, "Time it takes to complete a cluster state transition. If a state transition is preempted before completing, its elapsed time is counted as part of the total time spent for the final, completed state transition"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_BUSY("vds.distributor.stats.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.stats.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.stats.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_NOTCONNECTED("vds.distributor.stats.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_NOTFOUND("vds.distributor.stats.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_NOTREADY("vds.distributor.stats.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.stats.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_STORAGEFAILURE("vds.distributor.stats.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.stats.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_TIMEOUT("vds.distributor.stats.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_TOTAL("vds.distributor.stats.failures.total", Unit.OPERATION, "The total number of failures"),
+ VDS_DISTRIBUTOR_STATS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.stats.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_STATS_LATENCY("vds.distributor.stats.latency", Unit.MILLISECOND, "The average latency of stats operations"),
+ VDS_DISTRIBUTOR_STATS_OK("vds.distributor.stats.ok", Unit.OPERATION, "The number of successful stats operations performed"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_BUSY("vds.distributor.update_gets.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.update_gets.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.update_gets.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_NOTCONNECTED("vds.distributor.update_gets.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_NOTFOUND("vds.distributor.update_gets.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_NOTREADY("vds.distributor.update_gets.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.update_gets.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_STORAGEFAILURE("vds.distributor.update_gets.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.update_gets.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_TIMEOUT("vds.distributor.update_gets.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_TOTAL("vds.distributor.update_gets.failures.total", Unit.OPERATION, "The total number of failures"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.update_gets.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_LATENCY("vds.distributor.update_gets.latency", Unit.MILLISECOND, "The average latency of update_gets operations"),
+ VDS_DISTRIBUTOR_UPDATE_GETS_OK("vds.distributor.update_gets.ok", Unit.OPERATION, "The number of successful update_gets operations performed"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_BUSY("vds.distributor.update_metadata_gets.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.update_metadata_gets.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.update_metadata_gets.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_NOTCONNECTED("vds.distributor.update_metadata_gets.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_NOTFOUND("vds.distributor.update_metadata_gets.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_NOTREADY("vds.distributor.update_metadata_gets.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.update_metadata_gets.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_STORAGEFAILURE("vds.distributor.update_metadata_gets.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.update_metadata_gets.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_TIMEOUT("vds.distributor.update_metadata_gets.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_TOTAL("vds.distributor.update_metadata_gets.failures.total", Unit.OPERATION, "The total number of failures"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.update_metadata_gets.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_LATENCY("vds.distributor.update_metadata_gets.latency", Unit.MILLISECOND, "The average latency of update_metadata_gets operations"),
+ VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_OK("vds.distributor.update_metadata_gets.ok", Unit.OPERATION, "The number of successful update_metadata_gets operations performed"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_BUSY("vds.distributor.update_puts.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.update_puts.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.update_puts.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_NOTCONNECTED("vds.distributor.update_puts.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_NOTFOUND("vds.distributor.update_puts.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_NOTREADY("vds.distributor.update_puts.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.update_puts.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_STORAGEFAILURE("vds.distributor.update_puts.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.update_puts.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_TIMEOUT("vds.distributor.update_puts.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_TOTAL("vds.distributor.update_puts.failures.total", Unit.OPERATION, "The total number of failures"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.update_puts.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_LATENCY("vds.distributor.update_puts.latency", Unit.MILLISECOND, "The average latency of update_puts operations"),
+ VDS_DISTRIBUTOR_UPDATE_PUTS_OK("vds.distributor.update_puts.ok", Unit.OPERATION, "The number of successful update_puts operations performed"),
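+
+ // Each operation type above (visitor, getbucketlists, stats, update_gets,
+ // update_metadata_gets and update_puts) exposes the same failure taxonomy;
+ // the failures.total metric for a type is presumably the aggregate of the
+ // individual failure causes listed for that type.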
+
+ VDS_IDEALSTATE_NODES_PER_MERGE("vds.idealstate.nodes_per_merge", Unit.NODE, "The number of nodes involved in a single merge operation"),
+ VDS_IDEALSTATE_SET_BUCKET_STATE_BLOCKED("vds.idealstate.set_bucket_state.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"),
+ VDS_IDEALSTATE_SET_BUCKET_STATE_DONE_FAILED("vds.idealstate.set_bucket_state.done_failed", Unit.OPERATION, "The number of operations that failed"),
+ VDS_IDEALSTATE_SET_BUCKET_STATE_DONE_OK("vds.idealstate.set_bucket_state.done_ok", Unit.OPERATION, "The number of operations successfully performed"),
+ VDS_IDEALSTATE_SET_BUCKET_STATE_PENDING("vds.idealstate.set_bucket_state.pending", Unit.OPERATION, "The number of operations pending"),
+ VDS_IDEALSTATE_SET_BUCKET_STATE_THROTTLED("vds.idealstate.set_bucket_state.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"),
+
+ VDS_BOUNCER_CLOCK_SKEW_ABORTS("vds.bouncer.clock_skew_aborts", Unit.OPERATION, "Number of client operations that were aborted due to clock skew between sender and receiver exceeding acceptable range");
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ DistributorMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
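+
+ // Illustrative sketch only: a consumer could export these definitions with,
+ // e.g.,
+ //
+ //   for (DistributorMetrics m : DistributorMetrics.values()) {
+ //       receiver.declare(m.baseName(), m.unit(), m.description());
+ //   }
+ //
+ // where receiver.declare(..) is a hypothetical sink that registers a metric
+ // by name, unit and description.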
+
+}
diff --git a/metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java b/metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java
new file mode 100644
index 00000000000..927672a43f7
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java
@@ -0,0 +1,61 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+
+// TODO: Move to hosted repo.
+public enum HostedNodeAdminMetrics implements VespaMetrics {
+
+ // System metrics
+ CPU_UTIL("cpu.util", Unit.PERCENTAGE, "CPU utilisation"),
+ CPU_SYS_UTIL("cpu.sys.util", Unit.PERCENTAGE, "System CPU utilisation"),
+ CPU_THROTTLED_TIME("cpu.throttled_time.rate", Unit.PERCENTAGE, "Part of the time CPU is exhausted (CPU throttling enforced)"),
+ CPU_THROTTLED_CPU_TIME("cpu.throttled_cpu_time.rate", Unit.PERCENTAGE, "Part of the time CPU is exhausted (CPU throttling enforced)"),
+ CPU_VCPUS("cpu.vcpus", Unit.ITEM, "Number of virtual CPU threads allocated to the node"),
+ DISK_LIMIT("disk.limit", Unit.BYTE, "Amount of disk space available on the node"),
+ DISK_USED("disk.used", Unit.BYTE, "Amount of disk space used by the node"),
+ DISK_UTIL("disk.util", Unit.PERCENTAGE, "Disk space utilisation"),
+ MEM_LIMIT("mem.limit", Unit.BYTE, "Amount of memory available on the node"),
+ MEM_USED("mem.used", Unit.BYTE, "Amount of memory used by the node"),
+ MEM_UTIL("mem.util", Unit.PERCENTAGE, "Memory utilisation"),
+ MEM_TOTAL_USED("mem_total.used", Unit.BYTE, "Total amount of memory used by the node, including OS buffer caches"),
+ MEM_TOTAL_UTIL("mem_total.util", Unit.PERCENTAGE, "Total memory utilisation"),
+ GPU_UTIL("gpu.util", Unit.PERCENTAGE, "GPU utilisation"),
+ GPU_MEM_USED("gpu.memory.used", Unit.BYTE, "GPU memory used"),
+ GPU_MEM_TOTAL("gpu.memory.total", Unit.BYTE, "GPU memory available"),
+
+ // Network metrics
+ NET_IN_BYTES("net.in.bytes", Unit.BYTE, "Network bytes received (rxBytes) (COUNT metric)"),
+ NET_IN_ERROR("net.in.errors", Unit.FAILURE, "Network receive errors (rxErrors)"),
+ NET_IN_DROPPED("net.in.dropped", Unit.PACKET, "Inbound network packets dropped (rxDropped)"),
+ NET_OUT_BYTES("net.in.bytes", Unit.BYTE, "Network bytes sent (txBytes) (COUNT metric)"),
+ NET_OUT_ERROR("net.in.errors", Unit.FAILURE, "Network send errors (txErrors)"),
+ NET_OUT_DROPPED("net.in.dropped", Unit.PACKET, "Outbound network packets dropped (txDropped)"),
+ BANDWIDTH_LIMIT("bandwidth.limit", Unit.BYTE_PER_SECOND, "Available network bandwidth");
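+
+ // A minimal sanity sketch (an assumption, not enforced by this enum): each
+ // *_UTIL percentage is expected to relate to its used/limit pair as
+ // util ~ 100 * used / limit, e.g. disk.util ~ 100 * disk.used / disk.limit.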
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ HostedNodeAdminMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
+
diff --git a/metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java b/metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java
new file mode 100644
index 00000000000..79122e3b922
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java
@@ -0,0 +1,33 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+public enum LogdMetrics implements VespaMetrics {
+
+ LOGD_PROCESSED_LINES("logd.processed.lines", Unit.ITEM, "Number of log lines processed");
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ LogdMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
+
diff --git a/metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java b/metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java
new file mode 100644
index 00000000000..74da68dbcb7
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java
@@ -0,0 +1,35 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+public enum NodeAdminMetrics implements VespaMetrics {
+
+ ENDPOINT_CERTIFICATE_EXPIRY_SECONDS("endpoint.certificate.expiry.seconds", Unit.SECOND, "Time until node endpoint certificate expires"),
+ NODE_CERTIFICATE_EXPIRY_SECONDS("node-certificate.expiry.seconds", Unit.SECOND, "Time until node certificate expires");
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ NodeAdminMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
+
diff --git a/metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java
new file mode 100644
index 00000000000..cf35cdae90e
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java
@@ -0,0 +1,34 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+
+// Internal hosted Vespa only. TODO: Move to a better place.
+public enum RoutingLayerMetrics implements VespaMetrics {
+
+ WORKER_CONNECTIONS("worker.connections", Unit.CONNECTION, "Yahoo! Internal: Number of connections for the routing worker with the most connections, per node");
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ RoutingLayerMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
diff --git a/metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java b/metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java
new file mode 100644
index 00000000000..d6018dc0633
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java
@@ -0,0 +1,299 @@
+package ai.vespa.metrics;
+
+/**
+ * @author gjoranv
+ */
+public enum SearchNodeMetrics implements VespaMetrics {
+
+ CONTENT_PROTON_CONFIG_GENERATION("content.proton.config.generation", Unit.VERSION, "The oldest config generation used by this search node"),
+
+ CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_TOTAL("content.proton.documentdb.documents.total", Unit.DOCUMENT, "The total number of documents in this documents db (ready + not-ready)"),
+ CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_READY("content.proton.documentdb.documents.ready", Unit.DOCUMENT, "The number of ready documents in this document db"),
+ CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_ACTIVE("content.proton.documentdb.documents.active", Unit.DOCUMENT, "The number of active / searchable documents in this document db"),
+ CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_REMOVED("content.proton.documentdb.documents.removed", Unit.DOCUMENT, "The number of removed documents in this document db"),
+
+ CONTENT_PROTON_DOCUMENTDB_INDEX_DOCS_IN_MEMORY("content.proton.documentdb.index.docs_in_memory", Unit.DOCUMENT, "Number of documents in memory index"),
+ CONTENT_PROTON_DOCUMENTDB_DISK_USAGE("content.proton.documentdb.disk_usage", Unit.BYTE, "The total disk usage (in bytes) for this document db"),
+ CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
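+
+ // As documented above, the memory_usage.* metrics satisfy
+ // dead_bytes <= used_bytes <= allocated_bytes; onhold_bytes is reported separately.
+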
+ CONTENT_PROTON_DOCUMENTDB_HEART_BEAT_AGE("content.proton.documentdb.heart_beat_age", Unit.SECOND, "How long ago (in seconds) the heart beat maintenance job was run"),
+ CONTENT_PROTON_DOCSUM_COUNT("content.proton.docsum.count", Unit.REQUEST, "Docsum requests handled"),
+ CONTENT_PROTON_DOCSUM_DOCS("content.proton.docsum.docs", Unit.DOCUMENT, "Total docsums returned"),
+ CONTENT_PROTON_DOCSUM_LATENCY("content.proton.docsum.latency", Unit.MILLISECOND, "Docsum request latency"),
+
+ // Search protocol
+ CONTENT_PROTON_SEARCH_PROTOCOL_QUERY_LATENCY("content.proton.search_protocol.query.latency", Unit.SECOND, "Query request latency (seconds)"),
+ CONTENT_PROTON_SEARCH_PROTOCOL_QUERY_REQUEST_SIZE("content.proton.search_protocol.query.request_size", Unit.BYTE, "Query request size (network bytes)"),
+ CONTENT_PROTON_SEARCH_PROTOCOL_QUERY_REPLY_SIZE("content.proton.search_protocol.query.reply_size", Unit.BYTE, "Query reply size (network bytes)"),
+ CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_LATENCY("content.proton.search_protocol.docsum.latency", Unit.SECOND, "Docsum request latency (seconds)"),
+ CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REQUEST_SIZE("content.proton.search_protocol.docsum.request_size", Unit.BYTE, "Docsum request size (network bytes)"),
+ CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REPLY_SIZE("content.proton.search_protocol.docsum.reply_size", Unit.BYTE, "Docsum reply size (network bytes)"),
+ CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REQUESTED_DOCUMENTS("content.proton.search_protocol.docsum.requested_documents", Unit.DOCUMENT, "Total requested document summaries"),
+
+ // Executors shared between all document dbs
+ CONTENT_PROTON_EXECUTOR_PROTON_QUEUESIZE("content.proton.executor.proton.queuesize", Unit.TASK, "Size of executor proton task queue"),
+ CONTENT_PROTON_EXECUTOR_PROTON_ACCEPTED("content.proton.executor.proton.accepted", Unit.TASK, "Number of executor proton accepted tasks"),
+ CONTENT_PROTON_EXECUTOR_PROTON_WAKEUPS("content.proton.executor.proton.wakeups", Unit.WAKEUP, "Number of times an executor proton worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_PROTON_UTILIZATION("content.proton.executor.proton.utilization", Unit.FRACTION, "Ratio of time the executor proton worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_PROTON_REJECTED("content.proton.executor.proton.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_EXECUTOR_FLUSH_QUEUESIZE("content.proton.executor.flush.queuesize", Unit.TASK, "Size of executor flush task queue"),
+ CONTENT_PROTON_EXECUTOR_FLUSH_ACCEPTED("content.proton.executor.flush.accepted", Unit.TASK, "Number of accepted executor flush tasks"),
+ CONTENT_PROTON_EXECUTOR_FLUSH_WAKEUPS("content.proton.executor.flush.wakeups", Unit.WAKEUP, "Number of times an executor flush worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_FLUSH_UTILIZATION("content.proton.executor.flush.utilization", Unit.FRACTION, "Ratio of time the executor flush worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_FLUSH_REJECTED("content.proton.executor.flush.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_EXECUTOR_MATCH_QUEUESIZE("content.proton.executor.match.queuesize", Unit.TASK, "Size of executor match task queue"),
+ CONTENT_PROTON_EXECUTOR_MATCH_ACCEPTED("content.proton.executor.match.accepted", Unit.TASK, "Number of accepted executor match tasks"),
+ CONTENT_PROTON_EXECUTOR_MATCH_WAKEUPS("content.proton.executor.match.wakeups", Unit.WAKEUP, "Number of times an executor match worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_MATCH_UTILIZATION("content.proton.executor.match.utilization", Unit.FRACTION, "Ratio of time the executor match worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_MATCH_REJECTED("content.proton.executor.match.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_EXECUTOR_DOCSUM_QUEUESIZE("content.proton.executor.docsum.queuesize", Unit.TASK, "Size of executor docsum task queue"),
+ CONTENT_PROTON_EXECUTOR_DOCSUM_ACCEPTED("content.proton.executor.docsum.accepted", Unit.TASK, "Number of executor accepted docsum tasks"),
+ CONTENT_PROTON_EXECUTOR_DOCSUM_WAKEUPS("content.proton.executor.docsum.wakeups", Unit.WAKEUP, "Number of times an executor docsum worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_DOCSUM_UTILIZATION("content.proton.executor.docsum.utilization", Unit.FRACTION, "Ratio of time the executor docsum worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_DOCSUM_REJECTED("content.proton.executor.docsum.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_EXECUTOR_SHARED_QUEUESIZE("content.proton.executor.shared.queuesize", Unit.TASK, "Size of executor shared task queue"),
+ CONTENT_PROTON_EXECUTOR_SHARED_ACCEPTED("content.proton.executor.shared.accepted", Unit.TASK, "Number of executor shared accepted tasks"),
+ CONTENT_PROTON_EXECUTOR_SHARED_WAKEUPS("content.proton.executor.shared.wakeups", Unit.WAKEUP, "Number of times an executor shared worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_SHARED_UTILIZATION("content.proton.executor.shared.utilization", Unit.FRACTION, "Ratio of time the executor shared worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_SHARED_REJECTED("content.proton.executor.shared.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_EXECUTOR_WARMUP_QUEUESIZE("content.proton.executor.warmup.queuesize", Unit.TASK, "Size of executor warmup task queue"),
+ CONTENT_PROTON_EXECUTOR_WARMUP_ACCEPTED("content.proton.executor.warmup.accepted", Unit.TASK, "Number of accepted executor warmup tasks"),
+ CONTENT_PROTON_EXECUTOR_WARMUP_WAKEUPS("content.proton.executor.warmup.wakeups", Unit.WAKEUP, "Number of times a warmup executor worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_WARMUP_UTILIZATION("content.proton.executor.warmup.utilization", Unit.FRACTION, "Ratio of time the executor warmup worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_WARMUP_REJECTED("content.proton.executor.warmup.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_EXECUTOR_FIELD_WRITER_QUEUESIZE("content.proton.executor.field_writer.queuesize", Unit.TASK, "Size of executor field writer task queue"),
+ CONTENT_PROTON_EXECUTOR_FIELD_WRITER_ACCEPTED("content.proton.executor.field_writer.accepted", Unit.TASK, "Number of accepted executor field writer tasks"),
+ CONTENT_PROTON_EXECUTOR_FIELD_WRITER_WAKEUPS("content.proton.executor.field_writer.wakeups", Unit.WAKEUP, "Number of times an executor field writer worker thread has been woken up"),
+ CONTENT_PROTON_EXECUTOR_FIELD_WRITER_UTILIZATION("content.proton.executor.field_writer.utilization", Unit.FRACTION, "Ratio of time the executor field writer worker threads have been active"),
+ CONTENT_PROTON_EXECUTOR_FIELD_WRITER_REJECTED("content.proton.executor.field_writer.rejected", Unit.TASK, "Number of rejected tasks"),
+
+ // jobs
+ CONTENT_PROTON_DOCUMENTDB_JOB_TOTAL("content.proton.documentdb.job.total", Unit.FRACTION, "The job load average total of all job metrics"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_ATTRIBUTE_FLUSH("content.proton.documentdb.job.attribute_flush", Unit.FRACTION, "Flushing of attribute vector(s) to disk"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_MEMORY_INDEX_FLUSH("content.proton.documentdb.job.memory_index_flush", Unit.FRACTION, "Flushing of memory index to disk"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_DISK_INDEX_FUSION("content.proton.documentdb.job.disk_index_fusion", Unit.FRACTION, "Fusion of disk indexes"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_DOCUMENT_STORE_FLUSH("content.proton.documentdb.job.document_store_flush", Unit.FRACTION, "Flushing of document store to disk"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_DOCUMENT_STORE_COMPACT("content.proton.documentdb.job.document_store_compact", Unit.FRACTION, "Compaction of document store on disk"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_BUCKET_MOVE("content.proton.documentdb.job.bucket_move", Unit.FRACTION, "Moving of buckets between 'ready' and 'notready' sub databases"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_LID_SPACE_COMPACT("content.proton.documentdb.job.lid_space_compact", Unit.FRACTION, "Compaction of lid space in document meta store and attribute vectors"),
+ CONTENT_PROTON_DOCUMENTDB_JOB_REMOVED_DOCUMENTS_PRUNE("content.proton.documentdb.job.removed_documents_prune", Unit.FRACTION, "Pruning of removed documents in 'removed' sub database"),
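+
+ // The job.* metrics are load averages in the [0, 1] range per job; job.total
+ // is, per its description, the combined load of all the job metrics above.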
+
+ // Threading service (per document db)
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_QUEUESIZE("content.proton.documentdb.threading_service.master.queuesize", Unit.TASK, "Size of threading service master task queue"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_ACCEPTED("content.proton.documentdb.threading_service.master.accepted", Unit.TASK, "Number of accepted threading service master tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_WAKEUPS("content.proton.documentdb.threading_service.master.wakeups", Unit.WAKEUP, "Number of times a threading service master worker thread has been woken up"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_UTILIZATION("content.proton.documentdb.threading_service.master.utilization", Unit.FRACTION, "Ratio of time the threading service master worker threads have been active"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_REJECTED("content.proton.documentdb.threading_service.master.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_QUEUESIZE("content.proton.documentdb.threading_service.index.queuesize", Unit.TASK, "Size of threading service index task queue"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_ACCEPTED("content.proton.documentdb.threading_service.index.accepted", Unit.TASK, "Number of accepted threading service index tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_WAKEUPS("content.proton.documentdb.threading_service.index.wakeups", Unit.WAKEUP, "Number of times a threading service index worker thread has been woken up"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_UTILIZATION("content.proton.documentdb.threading_service.index.utilization", Unit.FRACTION, "Ratio of time the threading service index worker threads have been active"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_REJECTED("content.proton.documentdb.threading_service.index.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_QUEUESIZE("content.proton.documentdb.threading_service.summary.queuesize", Unit.TASK, "Size of threading service summary task queue"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_ACCEPTED("content.proton.documentdb.threading_service.summary.accepted", Unit.TASK, "Number of accepted threading service summary tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_WAKEUPS("content.proton.documentdb.threading_service.summary.wakeups", Unit.WAKEUP, "Number of times a threading service summary worker thread has been woken up"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_UTILIZATION("content.proton.documentdb.threading_service.summary.utilization", Unit.FRACTION, "Ratio of time the threading service summary worker threads have been active"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_REJECTED("content.proton.documentdb.threading_service.summary.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_ACCEPTED("content.proton.documentdb.threading_service.attribute_field_writer.accepted", Unit.TASK, "Number of accepted tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_QUEUESIZE("content.proton.documentdb.threading_service.attribute_field_writer.queuesize", Unit.TASK, "Size of task queue"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_REJECTED("content.proton.documentdb.threading_service.attribute_field_writer.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_UTILIZATION("content.proton.documentdb.threading_service.attribute_field_writer.utilization", Unit.FRACTION, "Ratio of time the worker threads have been active"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_WAKEUPS("content.proton.documentdb.threading_service.attribute_field_writer.wakeups", Unit.WAKEUP, "Number of times a worker thread has been woken up"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_ACCEPTED("content.proton.documentdb.threading_service.index_field_inverter.accepted", Unit.TASK, "Number of accepted tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_QUEUESIZE("content.proton.documentdb.threading_service.index_field_inverter.queuesize", Unit.TASK, "Size of task queue"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_REJECTED("content.proton.documentdb.threading_service.index_field_inverter.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_UTILIZATION("content.proton.documentdb.threading_service.index_field_inverter.utilization", Unit.FRACTION, "Ratio of time the worker threads have been active"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_WAKEUPS("content.proton.documentdb.threading_service.index_field_inverter.wakeups", Unit.WAKEUP, "Number of times a worker thread has been woken up"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_ACCEPTED("content.proton.documentdb.threading_service.index_field_writer.accepted", Unit.TASK, "Number of accepted tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_QUEUESIZE("content.proton.documentdb.threading_service.index_field_writer.queuesize", Unit.TASK, "Size of task queue"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_REJECTED("content.proton.documentdb.threading_service.index_field_writer.rejected", Unit.TASK, "Number of rejected tasks"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_UTILIZATION("content.proton.documentdb.threading_service.index_field_writer.utilization", Unit.FRACTION, "Ratio of time the worker threads have been active"),
+ CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_WAKEUPS("content.proton.documentdb.threading_service.index_field_writer.wakeups", Unit.WAKEUP, "Number of times a worker thread has been woken up"),
+
+ // lid space
+ CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LID_BLOAT_FACTOR("content.proton.documentdb.ready.lid_space.lid_bloat_factor", Unit.FRACTION, "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space ((lid_limit - used_lids) / lid_limit)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LID_FRAGMENTATION_FACTOR("content.proton.documentdb.ready.lid_space.lid_fragmentation_factor", Unit.FRACTION, "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space ((highest_used_lid - used_lids) / highest_used_lid)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LID_LIMIT("content.proton.documentdb.ready.lid_space.lid_limit", Unit.DOCUMENTID, "The size of the allocated lid space"),
+ CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_HIGHEST_USED_LID("content.proton.documentdb.ready.lid_space.highest_used_lid", Unit.DOCUMENTID, "The highest used lid"),
+ CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_USED_LIDS("content.proton.documentdb.ready.lid_space.used_lids", Unit.DOCUMENTID, "The number of lids used"),
+ CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LOWEST_FREE_LID("content.proton.documentdb.ready.lid_space.lowest_free_lid", Unit.DOCUMENTID, "The lowest free local document id"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LID_BLOAT_FACTOR("content.proton.documentdb.notready.lid_space.lid_bloat_factor", Unit.FRACTION, "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space ((lid_limit - used_lids) / lid_limit)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LID_FRAGMENTATION_FACTOR("content.proton.documentdb.notready.lid_space.lid_fragmentation_factor", Unit.FRACTION, "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space ((highest_used_lid - used_lids) / highest_used_lid)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LID_LIMIT("content.proton.documentdb.notready.lid_space.lid_limit", Unit.DOCUMENTID, "The size of the allocated lid space"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_HIGHEST_USED_LID("content.proton.documentdb.notready.lid_space.highest_used_lid", Unit.DOCUMENTID, "The highest used lid"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_USED_LIDS("content.proton.documentdb.notready.lid_space.used_lids", Unit.DOCUMENTID, "The number of lids used"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LOWEST_FREE_LID("content.proton.documentdb.notready.lid_space.lowest_free_lid", Unit.DOCUMENTID, "The lowest free local document id"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LID_BLOAT_FACTOR("content.proton.documentdb.removed.lid_space.lid_bloat_factor", Unit.FRACTION, "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space ((lid_limit - used_lids) / lid_limit)"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LID_FRAGMENTATION_FACTOR("content.proton.documentdb.removed.lid_space.lid_fragmentation_factor", Unit.FRACTION, "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space ((highest_used_lid - used_lids) / highest_used_lid)"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LID_LIMIT("content.proton.documentdb.removed.lid_space.lid_limit", Unit.DOCUMENTID, "The size of the allocated lid space"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_HIGHEST_USED_LID("content.proton.documentdb.removed.lid_space.highest_used_lid", Unit.DOCUMENTID, "The highest used lid"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_USED_LIDS("content.proton.documentdb.removed.lid_space.used_lids", Unit.DOCUMENTID, "The number of lids used"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LOWEST_FREE_LID("content.proton.documentdb.removed.lid_space.lowest_free_lid", Unit.DOCUMENTID, "The lowest free local document id"),
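+
+ // Worked example of the lid space formulas above: with lid_limit = 1000,
+ // used_lids = 800 and highest_used_lid = 900,
+ // lid_bloat_factor = (1000 - 800) / 1000 = 0.2 and
+ // lid_fragmentation_factor = (900 - 800) / 900 ~ 0.11.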
+
+ // bucket move
+ CONTENT_PROTON_DOCUMENTDB_BUCKET_MOVE_BUCKETS_PENDING("content.proton.documentdb.bucket_move.buckets_pending", Unit.BUCKET, "The number of buckets left to move"),
+
+ // resource usage
+ CONTENT_PROTON_RESOURCE_USAGE_DISK("content.proton.resource_usage.disk", Unit.FRACTION, "The relative amount of disk used by this content node (transient usage not included, value in the range [0, 1]). Same value as reported to the cluster controller"),
+ CONTENT_PROTON_RESOURCE_USAGE_DISK_USAGE_TOTAL("content.proton.resource_usage.disk_usage.total", Unit.FRACTION, "The total relative amount of disk used by this content node (value in the range [0, 1])"),
+ CONTENT_PROTON_RESOURCE_USAGE_DISK_USAGE_TOTAL_UTILIZATION("content.proton.resource_usage.disk_usage.total_utilization", Unit.FRACTION, "The relative amount of disk used compared to the content node disk resource limit"),
+ CONTENT_PROTON_RESOURCE_USAGE_DISK_USAGE_TRANSIENT("content.proton.resource_usage.disk_usage.transient", Unit.FRACTION, "The relative amount of transient disk used by this content node (value in the range [0, 1])"),
+ CONTENT_PROTON_RESOURCE_USAGE_MEMORY("content.proton.resource_usage.memory", Unit.FRACTION, "The relative amount of memory used by this content node (transient usage not included, value in the range [0, 1]). Same value as reported to the cluster controller"),
+ CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TOTAL("content.proton.resource_usage.memory_usage.total", Unit.FRACTION, "The total relative amount of memory used by this content node (value in the range [0, 1])"),
+ CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TOTAL_UTILIZATION("content.proton.resource_usage.memory_usage.total_utilization", Unit.FRACTION, "The relative amount of memory used compared to the content node memory resource limit"),
+ CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TRANSIENT("content.proton.resource_usage.memory_usage.transient", Unit.FRACTION, "The relative amount of transient memory used by this content node (value in the range [0, 1])"),
+ CONTENT_PROTON_RESOURCE_USAGE_MEMORY_MAPPINGS("content.proton.resource_usage.memory_mappings", Unit.FILE, "The number of memory mapped files"),
+ CONTENT_PROTON_RESOURCE_USAGE_OPEN_FILE_DESCRIPTORS("content.proton.resource_usage.open_file_descriptors", Unit.FILE, "The number of open files"),
+ CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED("content.proton.resource_usage.feeding_blocked", Unit.BINARY, "Whether feeding is blocked due to resource limits being reached (value is either 0 or 1)"),
+ CONTENT_PROTON_RESOURCE_USAGE_MALLOC_ARENA("content.proton.resource_usage.malloc_arena", Unit.BYTE, "Size of malloc arena"),
+ CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_ADDRESS_SPACE("content.proton.documentdb.attribute.resource_usage.address_space", Unit.FRACTION, "The max relative address space used among components in all attribute vectors in this document db (value in the range [0, 1])"),
+ CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_FEEDING_BLOCKED("content.proton.documentdb.attribute.resource_usage.feeding_blocked", Unit.BINARY, "Whether feeding is blocked due to attribute resource limits being reached (value is either 0 or 1)"),
+ CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.attribute.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.attribute.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.attribute.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.attribute.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+
+ // CPU util
+ CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_SETUP("content.proton.resource_usage.cpu_util.setup", Unit.FRACTION, "CPU used by system init and (re-)configuration"),
+ CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_READ("content.proton.resource_usage.cpu_util.read", Unit.FRACTION, "CPU used by reading data from the system"),
+ CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_WRITE("content.proton.resource_usage.cpu_util.write", Unit.FRACTION, "CPU used by writing data to the system"),
+ CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_COMPACT("content.proton.resource_usage.cpu_util.compact", Unit.FRACTION, "CPU used by internal data re-structuring"),
+ CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_OTHER("content.proton.resource_usage.cpu_util.other", Unit.FRACTION, "CPU used by work not classified as a specific category"),
+
+ // transaction log
+ CONTENT_PROTON_TRANSACTIONLOG_ENTRIES("content.proton.transactionlog.entries", Unit.RECORD, "The current number of entries in the transaction log"),
+ CONTENT_PROTON_TRANSACTIONLOG_DISK_USAGE("content.proton.transactionlog.disk_usage", Unit.BYTE, "The disk usage (in bytes) of the transaction log"),
+ CONTENT_PROTON_TRANSACTIONLOG_REPLAY_TIME("content.proton.transactionlog.replay_time", Unit.SECOND, "The replay time (in seconds) of the transaction log during start-up"),
+
+ // document store
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_DISK_USAGE("content.proton.documentdb.ready.document_store.disk_usage", Unit.BYTE, "Disk space usage in bytes"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_DISK_BLOAT("content.proton.documentdb.ready.document_store.disk_bloat", Unit.BYTE, "Disk space bloat in bytes"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MAX_BUCKET_SPREAD("content.proton.documentdb.ready.document_store.max_bucket_spread", Unit.FRACTION, "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.ready.document_store.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.ready.document_store.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.ready.document_store.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.ready.document_store.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_DISK_USAGE("content.proton.documentdb.notready.document_store.disk_usage", Unit.BYTE, "Disk space usage in bytes"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_DISK_BLOAT("content.proton.documentdb.notready.document_store.disk_bloat", Unit.BYTE, "Disk space bloat in bytes"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MAX_BUCKET_SPREAD("content.proton.documentdb.notready.document_store.max_bucket_spread", Unit.FRACTION, "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.notready.document_store.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.notready.document_store.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.notready.document_store.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.notready.document_store.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_DISK_USAGE("content.proton.documentdb.removed.document_store.disk_usage", Unit.BYTE, "Disk space usage in bytes"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_DISK_BLOAT("content.proton.documentdb.removed.document_store.disk_bloat", Unit.BYTE, "Disk space bloat in bytes"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MAX_BUCKET_SPREAD("content.proton.documentdb.removed.document_store.max_bucket_spread", Unit.FRACTION, "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.removed.document_store.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.removed.document_store.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.removed.document_store.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.removed.document_store.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+
+ // document store cache
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_ELEMENTS("content.proton.documentdb.ready.document_store.cache.elements", Unit.ITEM, "Number of elements in the cache"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_MEMORY_USAGE("content.proton.documentdb.ready.document_store.cache.memory_usage", Unit.BYTE, "Memory usage of the cache (in bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_HIT_RATE("content.proton.documentdb.ready.document_store.cache.hit_rate", Unit.FRACTION, "Rate of hits in the cache compared to number of lookups"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_LOOKUPS("content.proton.documentdb.ready.document_store.cache.lookups", Unit.OPERATION, "Number of lookups in the cache (hits + misses)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_INVALIDATIONS("content.proton.documentdb.ready.document_store.cache.invalidations", Unit.OPERATION, "Number of invalidations (erased elements) in the cache"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_ELEMENTS("content.proton.documentdb.notready.document_store.cache.elements", Unit.ITEM, "Number of elements in the cache"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_MEMORY_USAGE("content.proton.documentdb.notready.document_store.cache.memory_usage", Unit.BYTE, "Memory usage of the cache (in bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_HIT_RATE("content.proton.documentdb.notready.document_store.cache.hit_rate", Unit.FRACTION, "Rate of hits in the cache compared to number of lookups"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_LOOKUPS("content.proton.documentdb.notready.document_store.cache.lookups", Unit.OPERATION, "Number of lookups in the cache (hits + misses)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_INVALIDATIONS("content.proton.documentdb.notready.document_store.cache.invalidations", Unit.OPERATION, "Number of invalidations (erased elements) in the cache"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_ELEMENTS("content.proton.documentdb.removed.document_store.cache.elements", Unit.ITEM, "Number of elements in the cache"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_HIT_RATE("content.proton.documentdb.removed.document_store.cache.hit_rate", Unit.FRACTION, "Rate of hits in the cache compared to number of lookups"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_INVALIDATIONS("content.proton.documentdb.removed.document_store.cache.invalidations", Unit.OPERATION, "Number of invalidations (erased elements) in the cache"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_LOOKUPS("content.proton.documentdb.removed.document_store.cache.lookups", Unit.OPERATION, "Number of lookups in the cache (hits + misses)"),
+ CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_MEMORY_USAGE("content.proton.documentdb.removed.document_store.cache.memory_usage", Unit.BYTE, "Memory usage of the cache (in bytes)"),
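+
+ // Per the descriptions above, cache.lookups = hits + misses, so
+ // cache.hit_rate = hits / cache.lookups; e.g. 90 hits out of 100 lookups
+ // gives a hit_rate of 0.9.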
+
+ // attribute
+ CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.ready.attribute.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.ready.attribute.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.ready.attribute.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.ready.attribute.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.notready.attribute.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.notready.attribute.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.notready.attribute.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.notready.attribute.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+
+ // index
+ CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.index.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.index.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.index.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.index.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ CONTENT_PROTON_DOCUMENTDB_INDEX_DISK_USAGE("content.proton.documentdb.index.disk_usage", Unit.BYTE, "Disk space usage in bytes"),
+
+ // matching
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_QUERIES("content.proton.documentdb.matching.queries", Unit.QUERY, "Number of queries executed"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_SOFT_DOOMED_QUERIES("content.proton.documentdb.matching.soft_doomed_queries", Unit.QUERY, "Number of queries hitting the soft timeout"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_QUERY_LATENCY("content.proton.documentdb.matching.query_latency", Unit.SECOND, "Total average latency (sec) when matching and ranking a query"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_QUERY_SETUP_TIME("content.proton.documentdb.matching.query_setup_time", Unit.SECOND, "Average time (sec) spent setting up and tearing down queries"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_DOCS_MATCHED("content.proton.documentdb.matching.docs_matched", Unit.DOCUMENT, "Number of documents matched"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_DOCS_RANKED("content.proton.documentdb.matching.docs_ranked", Unit.DOCUMENT, "Number of documents ranked (first phase)"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_DOCS_RERANKED("content.proton.documentdb.matching.docs_reranked", Unit.DOCUMENT, "Number of documents re-ranked (second phase)"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_QUERIES("content.proton.documentdb.matching.rank_profile.queries", Unit.QUERY, "Number of queries executed"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_SOFT_DOOMED_QUERIES("content.proton.documentdb.matching.rank_profile.soft_doomed_queries", Unit.QUERY, "Number of queries hitting the soft timeout"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_SOFT_DOOM_FACTOR("content.proton.documentdb.matching.rank_profile.soft_doom_factor", Unit.FRACTION, "Factor used to compute soft-timeout"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_QUERY_LATENCY("content.proton.documentdb.matching.rank_profile.query_latency", Unit.SECOND, "Total average latency (sec) when matching and ranking a query"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_QUERY_SETUP_TIME("content.proton.documentdb.matching.rank_profile.query_setup_time", Unit.SECOND, "Average time (sec) spent setting up and tearing down queries"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_GROUPING_TIME("content.proton.documentdb.matching.rank_profile.grouping_time", Unit.SECOND, "Average time (sec) spent on grouping"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_RERANK_TIME("content.proton.documentdb.matching.rank_profile.rerank_time", Unit.SECOND, "Average time (sec) spent on 2nd phase ranking"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCS_MATCHED("content.proton.documentdb.matching.rank_profile.docs_matched", Unit.DOCUMENT, "Number of documents matched"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCS_RANKED("content.proton.documentdb.matching.rank_profile.docs_ranked", Unit.DOCUMENT, "Number of documents ranked (first phase)"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCS_RERANKED("content.proton.documentdb.matching.rank_profile.docs_reranked", Unit.DOCUMENT, "Number of documents re-ranked (second phase)"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_LIMITED_QUERIES("content.proton.documentdb.matching.rank_profile.limited_queries", Unit.QUERY, "Number of queries limited in match phase"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_ACTIVE_TIME("content.proton.documentdb.matching.rank_profile.docid_partition.active_time", Unit.SECOND, "Time (sec) spent doing actual work"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_DOCS_MATCHED("content.proton.documentdb.matching.rank_profile.docid_partition.docs_matched", Unit.DOCUMENT, "Number of documents matched"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_DOCS_RANKED("content.proton.documentdb.matching.rank_profile.docid_partition.docs_ranked", Unit.DOCUMENT, "Number of documents ranked (first phase)"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_DOCS_RERANKED("content.proton.documentdb.matching.rank_profile.docid_partition.docs_reranked", Unit.DOCUMENT, "Number of documents re-ranked (second phase)"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_WAIT_TIME("content.proton.documentdb.matching.rank_profile.docid_partition.wait_time", Unit.SECOND, "Time (sec) spent waiting for other external threads and resources"),
+ CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_MATCH_TIME("content.proton.documentdb.matching.rank_profile.match_time", Unit.SECOND, "Average time (sec) for matching a query (1st phase)"),
+
+ // feeding
+ CONTENT_PROTON_DOCUMENTDB_FEEDING_COMMIT_OPERATIONS("content.proton.documentdb.feeding.commit.operations", Unit.OPERATION, "Number of operations included in a commit"),
+ CONTENT_PROTON_DOCUMENTDB_FEEDING_COMMIT_LATENCY("content.proton.documentdb.feeding.commit.latency", Unit.SECOND, "Latency for commit in seconds"),
+
+
+ // Metrics emitters not used in any metrics sets
+ CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_CACHED("content.proton.session_cache.grouping.num_cached", Unit.SESSION, "Number of currently cached sessions"),
+ CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_DROPPED("content.proton.session_cache.grouping.num_dropped", Unit.SESSION, "Number of dropped cached sessions"),
+ CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_INSERT("content.proton.session_cache.grouping.num_insert", Unit.SESSION, "Number of inserted sessions"),
+ CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_PICK("content.proton.session_cache.grouping.num_pick", Unit.SESSION, "Number of picked sessions"),
+ CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_TIMEDOUT("content.proton.session_cache.grouping.num_timedout", Unit.SESSION, "Number of timed out sessions"),
+ CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_CACHED("content.proton.session_cache.search.num_cached", Unit.SESSION, "Number of currently cached sessions"),
+ CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_DROPPED("content.proton.session_cache.search.num_dropped", Unit.SESSION, "Number of dropped cached sessions"),
+ CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_INSERT("content.proton.session_cache.search.num_insert", Unit.SESSION, "Number of inserted sessions"),
+ CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_PICK("content.proton.session_cache.search.num_pick", Unit.SESSION, "Number of picked sessions"),
+ CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_TIMEDOUT("content.proton.session_cache.search.num_timedout", Unit.SESSION, "Number of timed out sessions"),
+
+ METRICMANAGER_PERIODICHOOKLATENCY("metricmanager.periodichooklatency", Unit.MILLISECOND, "Time in ms used to update a single periodic hook"),
+ METRICMANAGER_RESETLATENCY("metricmanager.resetlatency", Unit.MILLISECOND, "Time in ms used to reset all metrics."),
+ METRICMANAGER_SLEEPTIME("metricmanager.sleeptime", Unit.MILLISECOND, "Time in ms worker thread is sleeping"),
+ METRICMANAGER_SNAPSHOTHOOKLATENCY("metricmanager.snapshothooklatency", Unit.MILLISECOND, "Time in ms used to update a single snapshot hook"),
+ METRICMANAGER_SNAPSHOTLATENCY("metricmanager.snapshotlatency", Unit.MILLISECOND, "Time in ms used to take a snapshot");
+
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ SearchNodeMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
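Each constant above couples a base metric name with a Unit and a human-readable description, so a consumer can render the whole catalogue generically. A minimal sketch of such a consumer (hypothetical usage, not part of the patch; it assumes only the accessors defined in this file):

    // Hypothetical sketch: list every search node metric with its unit and description.
    for (SearchNodeMetrics metric : SearchNodeMetrics.values()) {
        System.out.println(metric.baseName()
                + " (" + metric.unit().shortName() + "): "
                + metric.description());
    }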
diff --git a/metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java b/metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java
new file mode 100644
index 00000000000..35ecbae85d8
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java
@@ -0,0 +1,36 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngve
+ */
+public enum SentinelMetrics implements VespaMetrics {
+
+ SENTINEL_RESTARTS("sentinel.restarts", Unit.RESTART, "Number of service restarts done by the sentinel"),
+ SENTINEL_TOTAL_RESTARTS("sentinel.totalRestarts", Unit.RESTART, "Total number of service restarts done by the sentinel since the sentinel was started"),
+ SENTINEL_UPTIME("sentinel.uptime", Unit.SECOND, "Time the sentinel has been running"),
+ SENTINEL_RUNNING("sentinel.running", Unit.INSTANCE, "Number of services the sentinel has running currently");
+
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ SentinelMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
diff --git a/metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java b/metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java
new file mode 100644
index 00000000000..1a6735af860
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java
@@ -0,0 +1,37 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngve
+ */
+public enum SlobrokMetrics implements VespaMetrics {
+
+ SLOBROK_HEARTBEATS_FAILED("slobrok.heartbeats.failed", Unit.REQUEST, "Number of heartbeat requests failed"),
+ SLOBROK_REQUESTS_REGISTER("slobrok.requests.register", Unit.REQUEST, "Number of register requests received"),
+ SLOBROK_REQUESTS_MIRROR("slobrok.requests.mirror", Unit.REQUEST, "Number of mirroring requests received"),
+ SLOBROK_REQUESTS_ADMIN("slobrok.requests.admin", Unit.REQUEST, "Number of administrative requests received"),
+ SLOBROK_MISSING_CONSENSUS("slobrok.missing.consensus", Unit.SECOND, "Number of seconds without full consensus with all other brokers");
+
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ SlobrokMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
diff --git a/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java b/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java
new file mode 100644
index 00000000000..7071fe0ae77
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java
@@ -0,0 +1,253 @@
+package ai.vespa.metrics;
+
+/**
+ * @author yngveaasheim
+ */
+public enum StorageMetrics implements VespaMetrics {
+
+ VDS_DATASTORED_ALLDISKS_BUCKETS("vds.datastored.alldisks.buckets", Unit.BUCKET, "Number of buckets managed"),
+ VDS_DATASTORED_ALLDISKS_DOCS("vds.datastored.alldisks.docs", Unit.DOCUMENT, "Number of documents stored"),
+ VDS_DATASTORED_ALLDISKS_BYTES("vds.datastored.alldisks.bytes", Unit.BYTE, "Number of bytes stored"),
+ VDS_DATASTORED_ALLDISKS_ACTIVEBUCKETS("vds.datastored.alldisks.activebuckets", Unit.BUCKET, "Number of active buckets on the node"),
+ VDS_DATASTORED_ALLDISKS_READYBUCKETS("vds.datastored.alldisks.readybuckets", Unit.BUCKET, "Number of ready buckets on the node"),
+
+ VDS_VISITOR_ALLTHREADS_AVERAGEVISITORLIFETIME("vds.visitor.allthreads.averagevisitorlifetime", Unit.MILLISECOND, "Average lifetime of a visitor"),
+ VDS_VISITOR_ALLTHREADS_AVERAGEQUEUEWAIT("vds.visitor.allthreads.averagequeuewait", Unit.MILLISECOND, "Average time an operation spends in input queue."),
+ VDS_VISITOR_ALLTHREADS_QUEUESIZE("vds.visitor.allthreads.queuesize", Unit.OPERATION, "Size of input message queue."),
+ VDS_VISITOR_ALLTHREADS_COMPLETED("vds.visitor.allthreads.completed", Unit.OPERATION, "Number of visitors completed"),
+ VDS_VISITOR_ALLTHREADS_CREATED("vds.visitor.allthreads.created", Unit.OPERATION, "Number of visitors created."),
+ VDS_VISITOR_ALLTHREADS_FAILED("vds.visitor.allthreads.failed", Unit.OPERATION, "Number of visitors failed"),
+ VDS_VISITOR_ALLTHREADS_AVERAGEMESSAGESENDTIME("vds.visitor.allthreads.averagemessagesendtime", Unit.MILLISECOND, "Average time it takes for messages to be sent to their target (and be replied to)"),
+ VDS_VISITOR_ALLTHREADS_AVERAGEPROCESSINGTIME("vds.visitor.allthreads.averageprocessingtime", Unit.MILLISECOND, "Average time used to process visitor requests"),
+ VDS_VISITOR_ALLTHREADS_ABORTED("vds.visitor.allthreads.aborted", Unit.INSTANCE, "Number of visitors aborted."),
+ VDS_VISITOR_ALLTHREADS_AVERAGEVISITORCREATIONTIME("vds.visitor.allthreads.averagevisitorcreationtime", Unit.MILLISECOND, "Average time spent creating a visitor instance"),
+ VDS_VISITOR_ALLTHREADS_DESTINATION_FAILURE_REPLIES("vds.visitor.allthreads.destination_failure_replies", Unit.INSTANCE, "Number of failure replies received from the visitor destination"),
+
+ VDS_FILESTOR_QUEUESIZE("vds.filestor.queuesize", Unit.OPERATION, "Size of input message queue."),
+ VDS_FILESTOR_AVERAGEQUEUEWAIT("vds.filestor.averagequeuewait", Unit.MILLISECOND, "Average time an operation spends in input queue."),
+ VDS_FILESTOR_ACTIVE_OPERATIONS_SIZE("vds.filestor.active_operations.size", Unit.OPERATION, "Number of concurrent active operations"),
+ VDS_FILESTOR_ACTIVE_OPERATIONS_LATENCY("vds.filestor.active_operations.latency", Unit.MILLISECOND, "Latency (in ms) for completed operations"), // TODO Vespa 9: Remove 'active' from the metric name
+ VDS_FILESTOR_THROTTLE_WINDOW_SIZE("vds.filestor.throttle_window_size", Unit.OPERATION, "Current size of the async operation throttler window"),
+ VDS_FILESTOR_THROTTLE_WAITING_THREADS("vds.filestor.throttle_waiting_threads", Unit.THREAD, "Number of threads waiting to acquire a throttle token"),
+ VDS_FILESTOR_THROTTLE_ACTIVE_TOKENS("vds.filestor.throttle_active_tokens", Unit.INSTANCE, "Current number of active throttle tokens"),
+ VDS_FILESTOR_ALLTHREADS_MERGEMETADATAREADLATENCY("vds.filestor.allthreads.mergemetadatareadlatency", Unit.MILLISECOND, "Time spent in a merge step to check metadata of current node to see what data it has."),
+ VDS_FILESTOR_ALLTHREADS_MERGEDATAREADLATENCY("vds.filestor.allthreads.mergedatareadlatency", Unit.MILLISECOND, "Time spent in a merge step to read data other nodes need."),
+ VDS_FILESTOR_ALLTHREADS_MERGEDATAWRITELATENCY("vds.filestor.allthreads.mergedatawritelatency", Unit.MILLISECOND, "Time spent in a merge step to write data needed to current node."),
+ VDS_FILESTOR_ALLTHREADS_MERGEAVGDATARECEIVEDNEEDED("vds.filestor.allthreads.mergeavgdatareceivedneeded", Unit.BYTE, "Amount of data transferred from previous node in chain that we needed to apply locally."),
+ VDS_FILESTOR_ALLTHREADS_MERGEBUCKETS_COUNT("vds.filestor.allthreads.mergebuckets.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_MERGEBUCKETS_FAILED("vds.filestor.allthreads.mergebuckets.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_MERGEBUCKETS_LATENCY("vds.filestor.allthreads.mergebuckets.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_MERGELATENCYTOTAL("vds.filestor.allthreads.mergelatencytotal", Unit.MILLISECOND, "Latency of the total merge operation, from when the master node receives it until the merge is complete and the master node replies."),
+ VDS_FILESTOR_ALLTHREADS_MERGE_PUT_LATENCY("vds.filestor.allthreads.put_latency", Unit.MILLISECOND, "Latency of individual puts that are part of merge operations"), // TODO Vespa 9: Update metric name to include 'merge'
+ VDS_FILESTOR_ALLTHREADS_MERGE_REMOVE_LATENCY("vds.filestor.allthreads.remove_latency", Unit.MILLISECOND, "Latency of individual removes that are part of merge operations"), // TODO Vespa 9: Update metric name to include 'merge'
+ VDS_FILESTOR_ALLSTRIPES_THROTTLED_RPC_DIRECT_DISPATCHES("vds.filestor.allstripes.throttled_rpc_direct_dispatches", Unit.INSTANCE, "Number of times an RPC thread could not dispatch an async operation directly to Proton because it was disallowed by the throttle policy"),
+ VDS_FILESTOR_ALLSTRIPES_THROTTLED_PERSISTENCE_THREAD_POLLS("vds.filestor.allstripes.throttled_persistence_thread_polls", Unit.INSTANCE, "Number of times a persistence thread could not immediately dispatch a queued async operation because it was disallowed by the throttle policy"),
+ VDS_FILESTOR_ALLSTRIPES_TIMEOUTS_WAITING_FOR_THROTTLE_TOKEN("vds.filestor.allstripes.timeouts_waiting_for_throttle_token", Unit.INSTANCE, "Number of times a persistence thread timed out waiting for an available throttle policy token"),
+ VDS_FILESTOR_ALLSTRIPES_AVERAGEQUEUEWAIT("vds.filestor.allstripes.averagequeuewait", Unit.MILLISECOND, "Average time an operation spends in input queue."),
+
+ VDS_FILESTOR_ALLTHREADS_PUT_COUNT("vds.filestor.allthreads.put.count", Unit.OPERATION, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_PUT_FAILED("vds.filestor.allthreads.put.failed", Unit.OPERATION, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_PUT_TEST_AND_SET_FAILED("vds.filestor.allthreads.put.test_and_set_failed", Unit.OPERATION, "Number of operations that were skipped because a test-and-set condition was not met"),
+ VDS_FILESTOR_ALLTHREADS_PUT_LATENCY("vds.filestor.allthreads.put.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_PUT_REQUEST_SIZE("vds.filestor.allthreads.put.request_size", Unit.BYTE, "Size of requests, in bytes"),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_COUNT("vds.filestor.allthreads.remove.count", Unit.OPERATION, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_FAILED("vds.filestor.allthreads.remove.failed", Unit.OPERATION, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_TEST_AND_SET_FAILED("vds.filestor.allthreads.remove.test_and_set_failed", Unit.OPERATION, "Number of operations that were skipped because a test-and-set condition was not met"),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_LATENCY("vds.filestor.allthreads.remove.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_REQUEST_SIZE("vds.filestor.allthreads.remove.request_size", Unit.BYTE, "Size of requests, in bytes"),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_NOT_FOUND("vds.filestor.allthreads.remove.not_found", Unit.REQUEST, "Number of requests that could not be completed because the source document was not found."),
+
+ VDS_FILESTOR_ALLTHREADS_GET_COUNT("vds.filestor.allthreads.get.count", Unit.OPERATION, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_GET_FAILED("vds.filestor.allthreads.get.failed", Unit.OPERATION, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_GET_LATENCY("vds.filestor.allthreads.get.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_GET_REQUEST_SIZE("vds.filestor.allthreads.get.request_size", Unit.BYTE, "Size of requests, in bytes"),
+ VDS_FILESTOR_ALLTHREADS_GET_NOT_FOUND("vds.filestor.allthreads.get.not_found", Unit.REQUEST, "Number of requests that could not be completed because the source document was not found."),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_COUNT("vds.filestor.allthreads.update.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_FAILED("vds.filestor.allthreads.update.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_TEST_AND_SET_FAILED("vds.filestor.allthreads.update.test_and_set_failed", Unit.REQUEST, "Number of requests that were skipped because a test-and-set condition was not met"),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_LATENCY("vds.filestor.allthreads.update.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_REQUEST_SIZE("vds.filestor.allthreads.update.request_size", Unit.BYTE, "Size of requests, in bytes"),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_LATENCY_READ("vds.filestor.allthreads.update.latency_read", Unit.MILLISECOND, "Latency of the source read in the request."),
+ VDS_FILESTOR_ALLTHREADS_UPDATE_NOT_FOUND("vds.filestor.allthreads.update.not_found", Unit.REQUEST, "Number of requests that could not be completed because the source document was not found."),
+ VDS_FILESTOR_ALLTHREADS_CREATEITERATOR_COUNT("vds.filestor.allthreads.createiterator.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_CREATEITERATOR_LATENCY("vds.filestor.allthreads.createiterator.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_CREATEITERATOR_FAILED("vds.filestor.allthreads.createiterator.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_VISIT_COUNT("vds.filestor.allthreads.visit.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_VISIT_LATENCY("vds.filestor.allthreads.visit.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_VISIT_DOCS("vds.filestor.allthreads.visit.docs", Unit.DOCUMENT, "Number of entries read per iterate call"),
+ VDS_FILESTOR_ALLTHREADS_VISIT_FAILED("vds.filestor.allthreads.visit.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_LOCATION_COUNT("vds.filestor.allthreads.remove_location.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_LOCATION_LATENCY("vds.filestor.allthreads.remove_location.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_REMOVE_LOCATION_FAILED("vds.filestor.allthreads.remove_location.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_SPLITBUCKETS_COUNT("vds.filestor.allthreads.splitbuckets.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_SPLITBUCKETS_FAILED("vds.filestor.allthreads.splitbuckets.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_SPLITBUCKETS_LATENCY("vds.filestor.allthreads.splitbuckets.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_JOINBUCKETS_COUNT("vds.filestor.allthreads.joinbuckets.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_JOINBUCKETS_FAILED("vds.filestor.allthreads.joinbuckets.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_JOINBUCKETS_LATENCY("vds.filestor.allthreads.joinbuckets.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_DELETEBUCKETS_COUNT("vds.filestor.allthreads.deletebuckets.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_DELETEBUCKETS_FAILED("vds.filestor.allthreads.deletebuckets.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_DELETEBUCKETS_LATENCY("vds.filestor.allthreads.deletebuckets.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_SETBUCKETSTATES_COUNT("vds.filestor.allthreads.setbucketstates.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_SETBUCKETSTATES_FAILED("vds.filestor.allthreads.setbucketstates.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_SETBUCKETSTATES_LATENCY("vds.filestor.allthreads.setbucketstates.latency", Unit.MILLISECOND, "Latency of successful requests."),
+
+ VDS_MERGETHROTTLER_AVERAGEQUEUEWAITINGTIME("vds.mergethrottler.averagequeuewaitingtime", Unit.MILLISECOND, "Time merges spent in the throttler queue"),
+ VDS_MERGETHROTTLER_QUEUESIZE("vds.mergethrottler.queuesize", Unit.INSTANCE, "Length of merge queue"),
+ VDS_MERGETHROTTLER_ACTIVE_WINDOW_SIZE("vds.mergethrottler.active_window_size", Unit.INSTANCE, "Number of merges active within the pending window size"),
+ VDS_MERGETHROTTLER_BOUNCED_DUE_TO_BACK_PRESSURE("vds.mergethrottler.bounced_due_to_back_pressure", Unit.INSTANCE, "Number of merges bounced due to resource exhaustion back-pressure"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_OK("vds.mergethrottler.locallyexecutedmerges.ok", Unit.INSTANCE, "The number of successful merges for 'locallyexecutedmerges'"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_ABORTED("vds.mergethrottler.locallyexecutedmerges.failures.aborted", Unit.OPERATION, "The number of merges that failed because the storage node was (most likely) shutting down"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_BUCKETNOTFOUND("vds.mergethrottler.locallyexecutedmerges.failures.bucketnotfound", Unit.OPERATION, "The number of operations that failed because the bucket did not exist"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_BUSY("vds.mergethrottler.locallyexecutedmerges.failures.busy", Unit.OPERATION, "The number of merges that failed because the storage node was busy"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_EXISTS("vds.mergethrottler.locallyexecutedmerges.failures.exists", Unit.OPERATION, "The number of merges that were rejected due to a merge operation for their bucket already being processed"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_NOTREADY("vds.mergethrottler.locallyexecutedmerges.failures.notready", Unit.OPERATION, "The number of merges discarded because the distributor was not ready"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_OTHER("vds.mergethrottler.locallyexecutedmerges.failures.other", Unit.OPERATION, "The number of other failures"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_REJECTED("vds.mergethrottler.locallyexecutedmerges.failures.rejected", Unit.OPERATION, "The number of merges that were rejected"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_TIMEOUT("vds.mergethrottler.locallyexecutedmerges.failures.timeout", Unit.OPERATION, "The number of merges that failed because they timed out towards storage"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_TOTAL("vds.mergethrottler.locallyexecutedmerges.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_WRONGDISTRIBUTION("vds.mergethrottler.locallyexecutedmerges.failures.wrongdistribution", Unit.OPERATION, "The number of merges that were discarded (flushed) because they were initiated at an older cluster state than the current"),
+ VDS_MERGETHROTTLER_MERGECHAINS_OK("vds.mergethrottler.mergechains.ok", Unit.OPERATION, "The number of successful merges for 'mergechains'"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_BUSY("vds.mergethrottler.mergechains.failures.busy", Unit.OPERATION, "The number of merges that failed because the storage node was busy"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_TOTAL("vds.mergethrottler.mergechains.failures.total", Unit.OPERATION, "Sum of all failures"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_EXISTS("vds.mergethrottler.mergechains.failures.exists", Unit.OPERATION, "The number of merges that were rejected due to a merge operation for their bucket already being processed"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_NOTREADY("vds.mergethrottler.mergechains.failures.notready", Unit.OPERATION, "The number of merges discarded because the distributor was not ready"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_OTHER("vds.mergethrottler.mergechains.failures.other", Unit.OPERATION, "The number of other failures"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_REJECTED("vds.mergethrottler.mergechains.failures.rejected", Unit.OPERATION, "The number of merges that were rejected"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_TIMEOUT("vds.mergethrottler.mergechains.failures.timeout", Unit.OPERATION, "The number of merges that failed because they timed out towards storage"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_WRONGDISTRIBUTION("vds.mergethrottler.mergechains.failures.wrongdistribution", Unit.OPERATION, "The number of merges that were discarded (flushed) because they were initiated at an older cluster state than the current"),
+
+
+ // C++ TLS metrics - these come from both the distributor and storage
+ VDS_SERVER_NETWORK_TLS_HANDSHAKES_FAILED("vds.server.network.tls-handshakes-failed", Unit.OPERATION, "Number of client or server connection attempts that failed during TLS handshaking"),
+ VDS_SERVER_NETWORK_PEER_AUTHORIZATION_FAILURES("vds.server.network.peer-authorization-failures", Unit.FAILURE, "Number of TLS connection attempts failed due to bad or missing peer certificate credentials"),
+ VDS_SERVER_NETWORK_CLIENT_TLS_CONNECTIONS_ESTABLISHED("vds.server.network.client.tls-connections-established", Unit.CONNECTION, "Number of secure mTLS connections established"),
+ VDS_SERVER_NETWORK_SERVER_TLS_CONNECTIONS_ESTABLISHED("vds.server.network.server.tls-connections-established", Unit.CONNECTION, "Number of secure mTLS connections established"),
+ VDS_SERVER_NETWORK_CLIENT_INSECURE_CONNECTIONS_ESTABLISHED("vds.server.network.client.insecure-connections-established", Unit.CONNECTION, "Number of insecure (plaintext) connections established"),
+ VDS_SERVER_NETWORK_SERVER_INSECURE_CONNECTIONS_ESTABLISHED("vds.server.network.server.insecure-connections-established", Unit.CONNECTION, "Number of insecure (plaintext) connections established"),
+ VDS_SERVER_NETWORK_TLS_CONNECTIONS_BROKEN("vds.server.network.tls-connections-broken", Unit.CONNECTION, "Number of TLS connections broken due to failures during frame encoding or decoding"),
+ VDS_SERVER_NETWORK_FAILED_TLS_CONFIG_RELOADS("vds.server.network.failed-tls-config-reloads", Unit.FAILURE, "Number of times background reloading of TLS config has failed"),
+
+ VDS_BOUNCER_UNAVAILABLE_NODE_ABORTS("vds.bouncer.unavailable_node_aborts", Unit.OPERATION, "Number of operations that were aborted due to the node (or target bucket space) being unavailable"),
+ VDS_CHANGEDBUCKETOWNERSHIPHANDLER_AVG_ABORT_PROCESSING_TIME("vds.changedbucketownershiphandler.avg_abort_processing_time", Unit.MILLISECOND, "Average time spent aborting operations for changed buckets"),
+ VDS_CHANGEDBUCKETOWNERSHIPHANDLER_EXTERNAL_LOAD_OPS_ABORTED("vds.changedbucketownershiphandler.external_load_ops_aborted", Unit.OPERATION, "Number of outdated external load operations aborted"),
+ VDS_CHANGEDBUCKETOWNERSHIPHANDLER_IDEAL_STATE_OPS_ABORTED("vds.changedbucketownershiphandler.ideal_state_ops_aborted", Unit.OPERATION, "Number of outdated ideal state operations aborted"),
+ VDS_COMMUNICATION_BUCKET_SPACE_MAPPING_FAILURES("vds.communication.bucket_space_mapping_failures", Unit.OPERATION, "Number of messages that could not be resolved to a known bucket space"),
+ VDS_COMMUNICATION_CONVERTFAILURES("vds.communication.convertfailures", Unit.OPERATION, "Number of messages that failed to get converted to storage API messages"),
+ VDS_COMMUNICATION_EXCEPTIONMESSAGEPROCESSTIME("vds.communication.exceptionmessageprocesstime", Unit.MILLISECOND, "Time the transport thread uses to process a single message that fails with an exception thrown into the communication manager"),
+ VDS_COMMUNICATION_MESSAGEPROCESSTIME("vds.communication.messageprocesstime", Unit.MILLISECOND, "Time the transport thread uses to process a single message"),
+ VDS_COMMUNICATION_MESSAGEQUEUE("vds.communication.messagequeue", Unit.ITEM, "Size of input message queue."),
+ VDS_COMMUNICATION_SENDCOMMANDLATENCY("vds.communication.sendcommandlatency", Unit.MILLISECOND, "Average ms used to send commands to MBUS"),
+ VDS_COMMUNICATION_SENDREPLYLATENCY("vds.communication.sendreplylatency", Unit.MILLISECOND, "Average ms used to send replies to MBUS"),
+ VDS_COMMUNICATION_TOOLITTLEMEMORY("vds.communication.toolittlememory", Unit.OPERATION, "Number of messages failed due to too little memory available"),
+
+ VDS_DATASTORED_BUCKET_SPACE_ACTIVE_BUCKETS("vds.datastored.bucket_space.active_buckets", Unit.BUCKET, "Number of active buckets in the bucket space"),
+ VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_ALLOCATED_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"),
+ VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_DEAD_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"),
+ VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_ONHOLD_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
+ VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_USED_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
+ VDS_DATASTORED_BUCKET_SPACE_BUCKETS_TOTAL("vds.datastored.bucket_space.buckets_total", Unit.BUCKET, "Total number of buckets present in the bucket space (ready + not ready)"),
+ VDS_DATASTORED_BUCKET_SPACE_BYTES("vds.datastored.bucket_space.bytes", Unit.BYTE, "Bytes stored across all documents in the bucket space"),
+ VDS_DATASTORED_BUCKET_SPACE_DOCS("vds.datastored.bucket_space.docs", Unit.DOCUMENT, "Documents stored in the bucket space"),
+ VDS_DATASTORED_BUCKET_SPACE_READY_BUCKETS("vds.datastored.bucket_space.ready_buckets", Unit.BUCKET, "Number of ready buckets in the bucket space"),
+ VDS_DATASTORED_FULLBUCKETINFOLATENCY("vds.datastored.fullbucketinfolatency", Unit.MILLISECOND, "Time spent processing a full bucket info request"),
+ VDS_DATASTORED_FULLBUCKETINFOREQSIZE("vds.datastored.fullbucketinforeqsize", Unit.NODE, "Number of distributors answered at once in full bucket info requests."),
+ VDS_DATASTORED_SIMPLEBUCKETINFOREQSIZE("vds.datastored.simplebucketinforeqsize", Unit.BUCKET, "Number of buckets returned in simple bucket info requests"),
+
+ VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFF_COUNT("vds.filestor.allthreads.applybucketdiff.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFF_FAILED("vds.filestor.allthreads.applybucketdiff.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFF_LATENCY("vds.filestor.allthreads.applybucketdiff.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFFREPLY("vds.filestor.allthreads.applybucketdiffreply", Unit.REQUEST, "Number of applybucketdiff replies that have been processed."),
+ VDS_FILESTOR_ALLTHREADS_BUCKETFIXED("vds.filestor.allthreads.bucketfixed", Unit.BUCKET, "Number of times a bucket has been fixed because of corruption"),
+ VDS_FILESTOR_ALLTHREADS_BUCKETVERIFIED_COUNT("vds.filestor.allthreads.bucketverified.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_BUCKETVERIFIED_FAILED("vds.filestor.allthreads.bucketverified.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_BUCKETVERIFIED_LATENCY("vds.filestor.allthreads.bucketverified.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_BYTESMERGED("vds.filestor.allthreads.bytesmerged", Unit.BYTE, "Total number of bytes merged into this node."),
+ VDS_FILESTOR_ALLTHREADS_CREATEBUCKETS_COUNT("vds.filestor.allthreads.createbuckets.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_CREATEBUCKETS_FAILED("vds.filestor.allthreads.createbuckets.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_CREATEBUCKETS_LATENCY("vds.filestor.allthreads.createbuckets.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_FAILEDOPERATIONS("vds.filestor.allthreads.failedoperations", Unit.OPERATION, "Number of operations throwing exceptions."),
+ VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFF_COUNT("vds.filestor.allthreads.getbucketdiff.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFF_FAILED("vds.filestor.allthreads.getbucketdiff.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFF_LATENCY("vds.filestor.allthreads.getbucketdiff.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFFREPLY("vds.filestor.allthreads.getbucketdiffreply", Unit.REQUEST, "Number of getbucketdiff replies that have been processed."),
+ VDS_FILESTOR_ALLTHREADS_INTERNALJOIN_COUNT("vds.filestor.allthreads.internaljoin.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_INTERNALJOIN_FAILED("vds.filestor.allthreads.internaljoin.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_INTERNALJOIN_LATENCY("vds.filestor.allthreads.internaljoin.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_MOVEDBUCKETS_COUNT("vds.filestor.allthreads.movedbuckets.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_MOVEDBUCKETS_FAILED("vds.filestor.allthreads.movedbuckets.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_MOVEDBUCKETS_LATENCY("vds.filestor.allthreads.movedbuckets.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_OPERATIONS("vds.filestor.allthreads.operations", Unit.OPERATION, "Number of operations processed."),
+
+ VDS_FILESTOR_ALLTHREADS_READBUCKETINFO_COUNT("vds.filestor.allthreads.readbucketinfo.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_READBUCKETINFO_FAILED("vds.filestor.allthreads.readbucketinfo.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_READBUCKETINFO_LATENCY("vds.filestor.allthreads.readbucketinfo.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_READBUCKETLIST_COUNT("vds.filestor.allthreads.readbucketlist.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_READBUCKETLIST_FAILED("vds.filestor.allthreads.readbucketlist.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_READBUCKETLIST_LATENCY("vds.filestor.allthreads.readbucketlist.latency", Unit.MILLISECOND, "Latency of successful requests."),
+
+ VDS_FILESTOR_ALLTHREADS_RECHECKBUCKETINFO_COUNT("vds.filestor.allthreads.recheckbucketinfo.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_RECHECKBUCKETINFO_FAILED("vds.filestor.allthreads.recheckbucketinfo.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_RECHECKBUCKETINFO_LATENCY("vds.filestor.allthreads.recheckbucketinfo.latency", Unit.MILLISECOND, "Latency of successful requests."),
+
+ VDS_FILESTOR_ALLTHREADS_REVERT_COUNT("vds.filestor.allthreads.revert.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_REVERT_FAILED("vds.filestor.allthreads.revert.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_REVERT_LATENCY("vds.filestor.allthreads.revert.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_ALLTHREADS_REVERT_NOT_FOUND("vds.filestor.allthreads.revert.not_found", Unit.REQUEST, "Number of requests that could not be completed because the source document was not found."),
+ VDS_FILESTOR_ALLTHREADS_STAT_BUCKET_COUNT("vds.filestor.allthreads.stat_bucket.count", Unit.REQUEST, "Number of requests processed."),
+ VDS_FILESTOR_ALLTHREADS_STAT_BUCKET_FAILED("vds.filestor.allthreads.stat_bucket.failed", Unit.REQUEST, "Number of failed requests."),
+ VDS_FILESTOR_ALLTHREADS_STAT_BUCKET_LATENCY("vds.filestor.allthreads.stat_bucket.latency", Unit.MILLISECOND, "Latency of successful requests."),
+ VDS_FILESTOR_BUCKET_DB_INIT_LATENCY("vds.filestor.bucket_db_init_latency", Unit.MILLISECOND, "Time taken (in ms) to initialize bucket databases with information from the persistence provider"),
+ VDS_FILESTOR_DIRECTORYEVENTS("vds.filestor.directoryevents", Unit.OPERATION, "Number of directory events received."),
+ VDS_FILESTOR_DISKEVENTS("vds.filestor.diskevents", Unit.OPERATION, "Number of disk events received."),
+ VDS_FILESTOR_PARTITIONEVENTS("vds.filestor.partitionevents", Unit.OPERATION, "Number of partition events received."),
+ VDS_FILESTOR_PENDINGMERGE("vds.filestor.pendingmerge", Unit.BUCKET, "Number of buckets currently being merged."),
+ VDS_FILESTOR_WAITINGFORLOCKRATE("vds.filestor.waitingforlockrate", Unit.OPERATION, "Number of times a filestor thread has had to wait for a lock before taking the next message from the queue."),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_ABORTED("vds.mergethrottler.mergechains.failures.aborted", Unit.OPERATION, "The number of merges that failed because the storage node was (most likely) shutting down"),
+ VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_BUCKETNOTFOUND("vds.mergethrottler.mergechains.failures.bucketnotfound", Unit.OPERATION, "The number of operations that failed because the bucket did not exist"),
+ VDS_SERVER_MEMORYUSAGE("vds.server.memoryusage", Unit.BYTE, "Amount of memory used by the storage subsystem"),
+ VDS_SERVER_MEMORYUSAGE_VISITING("vds.server.memoryusage_visiting", Unit.BYTE, "Memory used by messages related to visiting"),
+ VDS_SERVER_MESSAGE_MEMORY_USE_HIGHPRI("vds.server.message_memory_use.highpri", Unit.BYTE, "Memory used by high priority storage messages"),
+ VDS_SERVER_MESSAGE_MEMORY_USE_LOWPRI("vds.server.message_memory_use.lowpri", Unit.BYTE, "Memory used by low priority storage messages"),
+ VDS_SERVER_MESSAGE_MEMORY_USE_NORMALPRI("vds.server.message_memory_use.normalpri", Unit.BYTE, "Memory used by normal priority storage messages"),
+ VDS_SERVER_MESSAGE_MEMORY_USE_TOTAL("vds.server.message_memory_use.total", Unit.BYTE, "Total memory used by storage messages"),
+ VDS_SERVER_MESSAGE_MEMORY_USE_VERYHIGHPRI("vds.server.message_memory_use.veryhighpri", Unit.BYTE, "Memory used by very high priority storage messages"),
+ VDS_STATE_MANAGER_INVOKE_STATE_LISTENERS_LATENCY("vds.state_manager.invoke_state_listeners_latency", Unit.MILLISECOND, "Time spent (in ms) propagating state changes to internal state listeners"),
+ VDS_VISITOR_CV_QUEUEEVICTEDWAITTIME("vds.visitor.cv_queueevictedwaittime", Unit.MILLISECOND, "Milliseconds spent waiting in the create visitor queue by visitors that were evicted from the queue due to higher-priority visitors arriving"),
+ VDS_VISITOR_CV_QUEUEFULL("vds.visitor.cv_queuefull", Unit.OPERATION, "Number of create visitor messages that failed because the queue was full"),
+ VDS_VISITOR_CV_QUEUESIZE("vds.visitor.cv_queuesize", Unit.ITEM, "Size of create visitor queue"),
+ VDS_VISITOR_CV_QUEUETIMEOUTWAITTIME("vds.visitor.cv_queuetimeoutwaittime", Unit.MILLISECOND, "Milliseconds spent waiting in the create visitor queue by visitors that timed out while in the queue"),
+ VDS_VISITOR_CV_QUEUEWAITTIME("vds.visitor.cv_queuewaittime", Unit.MILLISECOND, "Milliseconds spent waiting in the create visitor queue by visitors that were added to the queue but scheduled later"),
+ VDS_VISITOR_CV_SKIPQUEUE("vds.visitor.cv_skipqueue", Unit.OPERATION, "Number of times the queue could be skipped because there were free visitor spots"),
+
+ // C++ capability metrics
+ VDS_SERVER_NETWORK_RPC_CAPABILITY_CHECKS_FAILED("vds.server.network.rpc-capability-checks-failed", Unit.FAILURE, "Number of RPC operations that failed due to one or more missing capabilities"),
+ VDS_SERVER_NETWORK_STATUS_CAPABILITY_CHECKS_FAILED("vds.server.network.status-capability-checks-failed", Unit.FAILURE, "Number of status page operations that failed due to one or more missing capabilities"),
+
+ // C++ Fnet metrics
+ VDS_SERVER_FNET_NUM_CONNECTIONS("vds.server.fnet.num-connections", Unit.CONNECTION, "Total number of connection objects");
+
+
+ private final String name;
+ private final Unit unit;
+ private final String description;
+
+ StorageMetrics(String name, Unit unit, String description) {
+ this.name = name;
+ this.unit = unit;
+ this.description = description;
+ }
+
+ public String baseName() {
+ return name;
+ }
+
+ public Unit unit() {
+ return unit;
+ }
+
+ public String description() {
+ return description;
+ }
+
+}
diff --git a/metrics/src/main/java/ai/vespa/metrics/Suffix.java b/metrics/src/main/java/ai/vespa/metrics/Suffix.java
new file mode 100644
index 00000000000..ce5e0aaa602
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/Suffix.java
@@ -0,0 +1,24 @@
+package ai.vespa.metrics;
+
+public enum Suffix {
+ ninety_five_percentile("95percentile"),
+ ninety_nine_percentile("99percentile"),
+ average("average"),
+ count("count"),
+ last("last"),
+ max("max"),
+ min("min"),
+ rate("rate"),
+ sum("sum");
+
+ private final String suffix;
+
+ Suffix(String suffix) {
+ this.suffix = suffix;
+ }
+
+ public String suffix() {
+ return suffix;
+ }
+
+}
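The suffix strings are the aggregate names appended to base metric names (see VespaMetrics below). A couple of hypothetical lookups, following directly from the constructor arguments above:

    Suffix.ninety_five_percentile.suffix();  // "95percentile"
    Suffix.count.suffix();                   // "count"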
diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java
new file mode 100644
index 00000000000..a2123d72246
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java
@@ -0,0 +1,129 @@
+package ai.vespa.metrics;
+
+/**
+ * @author gjoranv
+ */
+public enum Unit {
+
+ BINARY(BaseUnit.BINARY, "Zero or one. Zero typically indicates \"false\" while one indicates \"true\""),
+ BUCKET(BaseUnit.BUCKET, "A chunk of documents managed by a distributor service"),
+ BYTE(BaseUnit.BYTE, "A collection of 8 bits"),
+ BYTE_PER_SECOND(BaseUnit.BYTE, BaseUnit.SECOND, "Number of bytes per second"),
+ CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"),
+ DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"),
+ DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"),
+ FAILURE(BaseUnit.FAILURE, "Failures, typically for requests, operations or nodes"),
+ FILE(BaseUnit.FILE, "Data file stored on the disk on a node"),
+ FRACTION(BaseUnit.FRACTION, "A value in the range [0..1]. Higher values can occur for some metrics, but would indicate the value is outside of the allowed range."),
+ HIT(BaseUnit.HIT, "Document that meets the filtering/restriction criteria specified by a given query"),
+ HIT_PER_QUERY(BaseUnit.HIT, BaseUnit.QUERY, "Number of hits per query over a period of time"),
+ INSTANCE(BaseUnit.INSTANCE, "Typically tenant or application"),
+ ITEM(BaseUnit.ITEM, "Object or unit maintained in e.g. a queue"),
+ MILLISECOND(BaseUnit.MILLISECOND, "Millisecond, 1/1000 of a second"),
+ NANOSECOND(BaseUnit.NANOSECOND, "Nanosecond, 1/1,000,000,000 of a second"),
+ NODE(BaseUnit.NODE, "(Virtual) computer that is part of a Vespa cluster"),
+ PACKET(BaseUnit.PACKET, "Collection of data transmitted over the network as a single unit"),
+ OPERATION(BaseUnit.OPERATION, "A clearly defined task"),
+ OPERATION_PER_SECOND(BaseUnit.OPERATION, BaseUnit.SECOND, "Number of operations per second"),
+ PERCENTAGE(BaseUnit.PERCENTAGE, "A number expressed as a fraction of 100. Typically in the range [0..100]."),
+ QUERY(BaseUnit.QUERY, "A request for matching, grouping and/or scoring documents stored in Vespa"),
+ QUERY_PER_SECOND(BaseUnit.QUERY, BaseUnit.SECOND, "Number of queries per second."),
+ RECORD(BaseUnit.RECORD, "A collection of information, typically a set of key/value, e.g. stored in a transaction log"),
+ REQUEST(BaseUnit.REQUEST, "A request sent from a client to a server"),
+ RESPONSE(BaseUnit.RESPONSE, "A response from a server to a client, typically as a response to a request"),
+ RESTART(BaseUnit.RESTART, "A service or node restart"),
+ SCORE(BaseUnit.SCORE, "Relevance score for a document"),
+ SECOND(BaseUnit.SECOND, "Time span of 1 second"),
+ SESSION(BaseUnit.SESSION, "A set of operations taking place during one connection or as part of a higher level operation"),
+ TASK(BaseUnit.TASK, "Piece of work executed by a server, e.g. to perform background data maintenance"),
+ THREAD(BaseUnit.THREAD, "Computer thread for executing e.g. tasks, operations or queries"),
+ VERSION(BaseUnit.VERSION, "Software or config version"),
+ WAKEUP(BaseUnit.WAKEUP, "Computer thread wake-ups for doing some work");
+
+
+ private final BaseUnit unit;
+ private final BaseUnit perUnit;
+ private final String description;
+
+ Unit(BaseUnit unit, String description) {
+ this(unit, null, description);
+ }
+
+ Unit(BaseUnit unit, BaseUnit perUnit, String description) {
+ this.unit = unit;
+ this.perUnit = perUnit;
+ this.description = description;
+ }
+
+ public String fullName() {
+ return perUnit == null ?
+ unit.fullName() :
+ unit.fullName() + "/" + perUnit.fullName();
+ }
+
+ public String shortName() {
+ return perUnit == null ?
+ unit.shortName :
+ unit.shortName + "/" + perUnit.shortName;
+ }
+
+ public String getDescription() {
+ return description;
+ }
+
+ private enum BaseUnit {
+
+ AREA("area"),
+ BINARY("binary"),
+ BUCKET("bucket"),
+ BYTE("byte"),
+ CONNECTION("connection"),
+ DOCUMENT("document"),
+ DOCUMENTID("documentid"),
+ FAILURE("failure"),
+ FILE("file"),
+ FRACTION("fraction"),
+ HIT("hit"),
+ INSTANCE("instance"),
+ ITEM("item"),
+ MILLISECOND("millisecond", "ms"),
+ NANOSECOND("nanosecond", "ns"),
+ NODE("node"),
+ OPERATION("operation"),
+ PACKET("packet"),
+ PERCENTAGE("percentage"),
+ QUERY("query"),
+ RECORD("record"),
+ REQUEST("request"),
+ RESPONSE("response"),
+ RESTART("restart"),
+ SCORE("score"),
+ SECOND("second", "s"),
+ SESSION("session"),
+ TASK("task"),
+ THREAD("thread"),
+ VERSION("version"),
+ WAKEUP("wakeup");
+
+ private final String fullName;
+ private final String shortName;
+
+ BaseUnit(String fullName) {
+ this(fullName, fullName);
+ }
+
+ BaseUnit(String fullName, String shortName) {
+ this.fullName = fullName;
+ this.shortName = shortName;
+ }
+
+ public String fullName() {
+ return fullName;
+ }
+
+ public String shortName() {
+ return shortName;
+ }
+
+ }
+}
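A Unit is either a single base unit or a base unit composed with a per-unit, and only the time units define separate short names. A few hypothetical lookups illustrating the name composition defined above:

    Unit.MILLISECOND.fullName();       // "millisecond"
    Unit.MILLISECOND.shortName();      // "ms"
    Unit.BYTE_PER_SECOND.fullName();   // "byte/second"
    Unit.BYTE_PER_SECOND.shortName();  // "byte/s"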
diff --git a/metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java b/metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java
new file mode 100644
index 00000000000..3a17d8a3155
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java
@@ -0,0 +1,56 @@
+package ai.vespa.metrics;
+
+/**
+ * @author gjoranv
+ */
+public interface VespaMetrics {
+
+ String baseName();
+ Unit unit();
+ String description();
+
+ default String descriptionWithUnit() {
+ return description() + " (unit: " + unit().shortName() + ")";
+ }
+
+ private String withSuffix(Suffix suffix) {
+ return baseName() + "." + suffix.suffix();
+ }
+
+ default String ninety_five_percentile() {
+ return withSuffix(Suffix.ninety_five_percentile);
+ }
+
+ default String ninety_nine_percentile() {
+ return withSuffix(Suffix.ninety_nine_percentile);
+ }
+
+ default String average() {
+ return withSuffix(Suffix.average);
+ }
+
+ default String count() {
+ return withSuffix(Suffix.count);
+ }
+
+ default String last() {
+ return withSuffix(Suffix.last);
+ }
+
+ default String max() {
+ return withSuffix(Suffix.max);
+ }
+
+ default String min() {
+ return withSuffix(Suffix.min);
+ }
+
+ default String rate() {
+ return withSuffix(Suffix.rate);
+ }
+
+ default String sum() {
+ return withSuffix(Suffix.sum);
+ }
+
+}
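The default methods compose full metric names from the base name and a Suffix. Hypothetical examples, using one of the enums added earlier in this patch:

    SentinelMetrics.SENTINEL_UPTIME.baseName();                // "sentinel.uptime"
    SentinelMetrics.SENTINEL_UPTIME.max();                     // "sentinel.uptime.max"
    SentinelMetrics.SENTINEL_UPTIME.ninety_five_percentile();  // "sentinel.uptime.95percentile"
    SentinelMetrics.SENTINEL_UPTIME.descriptionWithUnit();     // "Time the sentinel has been running (unit: s)"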
diff --git a/metrics/src/main/java/ai/vespa/metrics/package-info.java b/metrics/src/main/java/ai/vespa/metrics/package-info.java
new file mode 100644
index 00000000000..98986f61dc5
--- /dev/null
+++ b/metrics/src/main/java/ai/vespa/metrics/package-info.java
@@ -0,0 +1,5 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+package ai.vespa.metrics;
+
+import com.yahoo.osgi.annotation.ExportPackage;