diff options
author | gjoranv <gv@yahooinc.com> | 2023-05-08 12:44:34 +0200 |
---|---|---|
committer | gjoranv <gv@yahooinc.com> | 2023-05-08 12:44:34 +0200 |
commit | be2ad5ff2b54bb8fdf145682005052fe26e1547d (patch) | |
tree | e86933d7cc037d11fdfedc3fb5a325f4d99357f1 /metrics | |
parent | 0c97250ee7c1bab4d3af448c8fea5e5ccdb9638a (diff) |
Move metrics definitions to metrics:ai.vespa.metrics
Diffstat (limited to 'metrics')
17 files changed, 1606 insertions, 0 deletions
diff --git a/metrics/pom.xml b/metrics/pom.xml index f44a2569ace..e8303e5a01f 100644 --- a/metrics/pom.xml +++ b/metrics/pom.xml @@ -15,6 +15,12 @@ <dependencies> <dependency> <groupId>com.yahoo.vespa</groupId> + <artifactId>annotations</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.yahoo.vespa</groupId> <artifactId>config-lib</artifactId> <version>${project.version}</version> <scope>provided</scope> diff --git a/metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java new file mode 100644 index 00000000000..545de543663 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/ClusterControllerMetrics.java @@ -0,0 +1,53 @@ +package ai.vespa.metrics; + +/** + * @author yngve + */ +public enum ClusterControllerMetrics implements VespaMetrics { + + DOWN_COUNT("cluster-controller.down.count", Unit.NODE, "Number of content nodes down"), + INITIALIZING_COUNT("cluster-controller.initializing.count", Unit.NODE, "Number of content nodes initializing"), + MAINTENANCE_COUNT("cluster-controller.maintenance.count", Unit.NODE, "Number of content nodes in maintenance"), + RETIRED_COUNT("cluster-controller.retired.count", Unit.NODE, "Number of content nodes that are retired"), + STOPPING_COUNT("cluster-controller.stopping.count", Unit.NODE, "Number of content nodes currently stopping"), + UP_COUNT("cluster-controller.up.count", Unit.NODE, "Number of content nodes up"), + CLUSTER_STATE_CHANGE_COUNT("cluster-controller.cluster-state-change.count", Unit.NODE, "Number of nodes changing state"), + BUSY_TICK_TIME_MS("cluster-controller.busy-tick-time-ms", Unit.MILLISECOND, "Time busy"), + IDLE_TICK_TIME_MS("cluster-controller.idle-tick-time-ms", Unit.MILLISECOND, "Time idle"), + WORK_MS("cluster-controller.work-ms", Unit.MILLISECOND, "Time used for actual work"), + IS_MASTER("cluster-controller.is-master", Unit.BINARY, "1 if this cluster controller is currently the master, or 0 if not"), + REMOTE_TASK_QUEUE_SIZE("cluster-controller.remote-task-queue.size", Unit.OPERATION, "Number of remote tasks queued"), + // TODO(hakonhall): Update this name once persistent "count" metrics has been implemented. + // DO NOT RELY ON THIS METRIC YET. + NODE_EVENT_COUNT("cluster-controller.node-event.count", Unit.OPERATION, "Number of node events"), + RESOURCE_USAGE_NODES_ABOVE_LIMIT("cluster-controller.resource_usage.nodes_above_limit", Unit.NODE, "The number of content nodes above resource limit, blocking feed"), + RESOURCE_USAGE_MAX_MEMORY_UTILIZATION("cluster-controller.resource_usage.max_memory_utilization", Unit.FRACTION, "Current memory utilisation, per content node"), + RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"), + RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"), + RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Memory space limit as a fraction of available memory"), + REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress"); + + + private final String name; + private final Unit unit; + private final String description; + + ClusterControllerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java new file mode 100644 index 00000000000..dba75c08aae --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java @@ -0,0 +1,60 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ +public enum ConfigServerMetrics implements VespaMetrics { + + REQUESTS("configserver.requests", Unit.REQUEST, "Number of requests processed"), + FAILED_REQUESTS("configserver.failedRequests", Unit.REQUEST, "Number of requests that failed"), + LATENCY("configserver.latency", Unit.MILLISECOND, "Time to complete requests"), + CACHE_CONFIG_ELEMS("configserver.cacheConfigElems", Unit.ITEM, "Time to complete requests"), + CACHE_CHECKSUM_ELEMS("configserver.cacheChecksumElems", Unit.ITEM, "Number of checksum elements in the cache"), + HOSTS("configserver.hosts", Unit.NODE, "The number of nodes being served configuration from the config server cluster"), + TENANTS("configserver.tenants", Unit.INSTANCE, "The number of tenants being served configuration from the config server cluster"), + APPLICATIONS("configserver.applications", Unit.INSTANCE, "The number of applications being served configuration from the config server cluster"), + DELAYED_RESPONSES("configserver.delayedResponses", Unit.RESPONSE, "Number of delayed responses"), + SESSION_CHANGE_ERRORS("configserver.sessionChangeErrors", Unit.SESSION, "Number of session change errors"), + UNKNOWN_HOST_REQUEST("configserver.unknownHostRequests", Unit.REQUEST, "Config requests from unknown hosts"), + NEW_SESSIONS("configserver.newSessions", Unit.SESSION, "New config sessions"), + PREPARED_SESSIONS("configserver.preparedSessions", Unit.SESSION, "Prepared config sessions"), + ACTIVE_SESSIONS("configserver.activeSessions", Unit.SESSION, "Active config sessions"), + INACTIVE_SESSIONS("configserver.inactiveSessions", Unit.SESSION, "Inactive config sessions"), + ADDED_SESSIONS("configserver.addedSessions", Unit.SESSION, "Added config sessions"), + REMOVED_SESSIONS("configserver.removedSessions", Unit.SESSION, "Removed config sessions"), + RPC_SERVER_WORK_QUEUE_SIZE("configserver.rpcServerWorkQueueSize", Unit.ITEM, "Number of elements in the RPC server work queue"), + + // ZooKeeper related metrics + ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"), + ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"), + ZK_CONNECTED("configserver.zkConnected", Unit.NODE, "Number of ZooKeeper nodes connected"), + ZK_SUSPENDED("configserver.zkSuspended", Unit.NODE, "Number of ZooKeeper nodes suspended"), + ZK_Z_NODES("configserver.zkZNodes", Unit.NODE, "Number of ZooKeeper nodes present"), + ZK_AVG_LATENCY("configserver.zkAvgLatency", Unit.MILLISECOND, "Average latency for ZooKeeper requests"), // TODO: Confirm metric name + ZK_MAX_LATENCY("configserver.zkMaxLatency", Unit.MILLISECOND, "Max latency for ZooKeeper requests"), + ZK_CONNECTIONS("configserver.zkConnections", Unit.CONNECTION, "Number of ZooKeeper connections"), + ZK_OUTSTANDING_REQUESTS("configserver.zkOutstandingRequests", Unit.REQUEST, "Number of ZooKeeper requests in flight"); + + private final String name; + private final Unit unit; + private final String description; + + ConfigServerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java new file mode 100644 index 00000000000..84de1878b3f --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/ContainerMetrics.java @@ -0,0 +1,228 @@ +package ai.vespa.metrics; + +/** + * @author gjoranv + */ +public enum ContainerMetrics implements VespaMetrics { + + HTTP_STATUS_1XX("http.status.1xx", Unit.RESPONSE, "Number of responses with a 1xx status"), + HTTP_STATUS_2XX("http.status.2xx", Unit.RESPONSE, "Number of responses with a 2xx status"), + HTTP_STATUS_3XX("http.status.3xx", Unit.RESPONSE, "Number of responses with a 3xx status"), + HTTP_STATUS_4XX("http.status.4xx", Unit.RESPONSE, "Number of responses with a 4xx status"), + HTTP_STATUS_5XX("http.status.5xx", Unit.RESPONSE, "Number of responses with a 5xx status"), + + APPLICATION_GENERATION("application_generation", Unit.VERSION, "The currently live application config generation (aka session id)"), + IN_SERVICE("in_service", Unit.BINARY, "This will have the value 1 if the node is in service, 0 if not."), + + JDISC_GC_COUNT("jdisc.gc.count", Unit.OPERATION, "Number of JVM garbage collections done"), + JDISC_GC_MS("jdisc.gc.ms", Unit.MILLISECOND, "Time spent in JVM garbage collection"), + JDISC_JVM("jdisc.jvm", Unit.VERSION, "JVM runtime version"), + JDISC_MEMORY_MAPPINGS("jdisc.memory_mappings", Unit.OPERATION, "JDISC Memory mappings"), + JDISC_OPEN_FILE_DESCRIPTORS("jdisc.open_file_descriptors", Unit.ITEM, "JDISC Open file descriptors"), + + JDISC_THREAD_POOL_UNHANDLED_EXCEPTIONS("jdisc.thread_pool.unhandled_exceptions", Unit.THREAD, "Number of exceptions thrown by tasks"), + JDISC_THREAD_POOL_WORK_QUEUE_CAPACITY("jdisc.thread_pool.work_queue.capacity", Unit.THREAD, "Capacity of the task queue"), + JDISC_THREAD_POOL_WORK_QUEUE_SIZE("jdisc.thread_pool.work_queue.size", Unit.THREAD, "Size of the task queue"), + JDISC_THREAD_POOL_REJECTED_TASKS("jdisc.thread_pool.rejected_tasks", Unit.THREAD, "Number of tasks rejected by the thread pool"), + JDISC_THREAD_POOL_SIZE("jdisc.thread_pool.size", Unit.THREAD, "Size of the thread pool"), + JDISC_THREAD_POOL_MAX_ALLOWED_SIZE("jdisc.thread_pool.max_allowed_size", Unit.THREAD, "The maximum allowed number of threads in the pool"), + JDISC_THREAD_POOL_ACTIVE_THREADS("jdisc.thread_pool.active_threads", Unit.THREAD, "Number of threads that are active"), + + JDISC_DEACTIVATED_CONTAINERS("jdisc.deactivated_containers.total", Unit.ITEM, "JDISC Deactivated container instances"), + JDISC_DEACTIVATED_CONTAINERS_WITH_RETAINED_REFS("jdisc.deactivated_containers.with_retained_refs.last", Unit.ITEM, "JDISC Deactivated container nodes with retained refs"), + JDISC_APPLICATION_FAILED_COMPONENT_GRAPHS("jdisc.application.failed_component_graphs", Unit.ITEM, "JDISC Application failed component graphs"), + JDISC_APPLICATION_COMPONENT_GRAPH_CREATION_TIME_MILLIS("jdisc.application.component_graph.creation_time_millis", Unit.MILLISECOND, "JDISC Application component graph creation time"), + JDISC_APPLICATION_COMPONENT_GRAPH_RECONFIGURATIONS("jdisc.application.component_graph.reconfigurations", Unit.ITEM, "JDISC Application component graph reconfigurations"), + + JDISC_SINGLETON_IS_ACTIVE("jdisc.singleton.is_active", Unit.ITEM, "JDISC Singleton is active"), + JDISC_SINGLETON_ACTIVATION_COUNT("jdisc.singleton.activation.count", Unit.OPERATION, "JDISC Singleton activations"), + JDISC_SINGLETON_ACTIVATION_FAILURE_COUNT("jdisc.singleton.activation.failure.count", Unit.OPERATION, "JDISC Singleton activation failures"), + JDISC_SINGLETON_ACTIVATION_MILLIS("jdisc.singleton.activation.millis", Unit.MILLISECOND, "JDISC Singleton activation time"), + JDISC_SINGLETON_DEACTIVATION_COUNT("jdisc.singleton.deactivation.count", Unit.OPERATION, "JDISC Singleton deactivations"), + JDISC_SINGLETON_DEACTIVATION_FAILURE_COUNT("jdisc.singleton.deactivation.failure.count", Unit.OPERATION, "JDISC Singleton deactivation failures"), + JDISC_SINGLETON_DEACTIVATION_MILLIS("jdisc.singleton.deactivation.millis", Unit.MILLISECOND, "JDISC Singleton deactivation time"), + + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_MISSING_CLIENT_CERT("jdisc.http.ssl.handshake.failure.missing_client_cert", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to missing client certificate"), + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_EXPIRED_CLIENT_CERT("jdisc.http.ssl.handshake.failure.expired_client_cert", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to expired client certificate"), + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INVALID_CLIENT_CERT("jdisc.http.ssl.handshake.failure.invalid_client_cert", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to invalid client certificate"), + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_PROTOCOLS("jdisc.http.ssl.handshake.failure.incompatible_protocols", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to inincompatible protocols"), + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_INCOMPATIBLE_CHIFERS("jdisc.http.ssl.handshake.failure.incompatible_chifers", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to incompatible chifers"), + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_CONNECTION_CLOSED("jdisc.http.ssl.handshake.failure.connection_closed", Unit.OPERATION, "JDISC HTTP SSL Handshake failures due to connection closed"), + JDISC_HTTP_SSL_HANDSHAKE_FAILURE_UNKNOWN("jdisc.http.ssl.handshake.failure.unknown", Unit.OPERATION, "JDISC HTTP SSL Handshake failures for unknown reason"), + + JDISC_HTTP_REQUEST_PREMATURELY_CLOSED("jdisc.http.request.prematurely_closed", Unit.REQUEST, "HTTP requests prematurely closed"), + JDISC_HTTP_REQUEST_REQUESTS_PER_CONNECTION("jdisc.http.request.requests_per_connection", Unit.REQUEST, "HTTP requests per connection"), + JDISC_HTTP_REQUEST_URI_LENGTH("jdisc.http.request.uri_length", Unit.BYTE, "HTTP URI length"), + JDISC_HTTP_REQUEST_CONTENT_SIZE("jdisc.http.request.content_size", Unit.BYTE, "HTTP request content size"), + JDISC_HTTP_REQUESTS("jdisc.http.requests", Unit.REQUEST, "HTTP requests"), + JDISC_HTTP_REQUESTS_STATUS("jdisc.http.requests.status", Unit.REQUEST, "Number of requests to the built-in status handler"), + + JDISC_HTTP_FILTER_RULE_BLOCKED_REQUESTS("jdisc.http.filter.rule.blocked_requests", Unit.REQUEST, "Number of requests blocked by filter"), + JDISC_HTTP_FILTER_RULE_ALLOWED_REQUESTS("jdisc.http.filter.rule.allowed_requests", Unit.REQUEST, "Number of requests allowed by filter"), + JDISC_HTTP_FILTERING_REQUEST_HANDLED("jdisc.http.filtering.request.handled", Unit.REQUEST, "Number of filtering requests handled"), + JDISC_HTTP_FILTERING_REQUEST_UNHANDLED("jdisc.http.filtering.request.unhandled", Unit.REQUEST, "Number of filtering requests unhandled"), + JDISC_HTTP_FILTERING_RESPONSE_HANDLED("jdisc.http.filtering.response.handled", Unit.REQUEST, "Number of filtering responses handled"), + JDISC_HTTP_FILTERING_RESPONSE_UNHANDLED("jdisc.http.filtering.response.unhandled", Unit.REQUEST, "Number of filtering responses unhandled"), + JDISC_HTTP_HANDLER_UNHANDLED_EXCEPTIONS("jdisc.http.handler.unhandled_exceptions", Unit.REQUEST, "Number of unhandled exceptions in handler"), + + JDISC_TLS_CAPABILITY_CHECKS_SUCCEEDED("jdisc.tls.capability_checks.succeeded", Unit.OPERATION, "Number of TLS capability checks succeeded"), + JDISC_TLS_CAPABILITY_CHECKS_FAILED("jdisc.tls.capability_checks.failed", Unit.OPERATION, "Number of TLS capability checks failed"), + + JETTY_THREADPOOL_MAX_THREADS("jdisc.http.jetty.threadpool.thread.max", Unit.THREAD, "Configured maximum number of threads"), + JETTY_THREADPOOL_MIN_THREADS("jdisc.http.jetty.threadpool.thread.min", Unit.THREAD, "Configured minimum number of threads"), + JETTY_THREADPOOL_RESERVED_THREADS("jdisc.http.jetty.threadpool.thread.reserved", Unit.THREAD, "Configured number of reserved threads or -1 for heuristic"), + JETTY_THREADPOOL_BUSY_THREADS("jdisc.http.jetty.threadpool.thread.busy", Unit.THREAD, "Number of threads executing internal and transient jobs"), + JETTY_THREADPOOL_IDLE_THREADS("jdisc.http.jetty.threadpool.thread.idle", Unit.THREAD, "Number of idle threads"), + JETTY_THREADPOOL_TOTAL_THREADS("jdisc.http.jetty.threadpool.thread.total", Unit.THREAD, "Current number of threads"), + JETTY_THREADPOOL_QUEUE_SIZE("jdisc.http.jetty.threadpool.queue.size", Unit.THREAD, "Current size of the job queue"), + + SERVER_NUM_OPEN_CONNECTIONS("serverNumOpenConnections", Unit.CONNECTION, "The number of currently open connections"), + SERVER_NUM_CONNECTIONS("serverNumConnections", Unit.CONNECTION, "The total number of connections opened"), + + SERVER_BYTES_RECEIVED("serverBytesReceived", Unit.BYTE, "The number of bytes received by the server"), + SERVER_BYTES_SENT("serverBytesSent", Unit.BYTE, "The number of bytes sent from the server"), + + HANDLED_REQUESTS("handled.requests", Unit.OPERATION, "The number of requests handled per metrics snapshot"), + HANDLED_LATENCY("handled.latency", Unit.MILLISECOND, "The time used for requests during this metrics snapshot"), + + HTTPAPI_LATENCY("httpapi_latency", Unit.MILLISECOND, "Duration for requests to the HTTP document APIs"), + HTTPAPI_PENDING("httpapi_pending", Unit.OPERATION, "Document operations pending execution"), + HTTPAPI_NUM_OPERATIONS("httpapi_num_operations", Unit.OPERATION, "Total number of document operations performed"), + HTTPAPI_NUM_UPDATES("httpapi_num_updates", Unit.OPERATION, "Document update operations performed"), + HTTPAPI_NUM_REMOVES("httpapi_num_removes", Unit.OPERATION, "Document remove operations performed"), + HTTPAPI_NUM_PUTS("httpapi_num_puts", Unit.OPERATION, "Document put operations performed"), + HTTPAPI_SUCCEEDED("httpapi_succeeded", Unit.OPERATION, "Document operations that succeeded"), + HTTPAPI_FAILED("httpapi_failed", Unit.OPERATION, "Document operations that failed"), + HTTPAPI_PARSE_ERROR("httpapi_parse_error", Unit.OPERATION, "Document operations that failed due to document parse errors"), + HTTPAPI_CONDITION_NOT_MET("httpapi_condition_not_met", Unit.OPERATION, "Document operations not applied due to condition not met"), + HTTPAPI_NOT_FOUND("httpapi_not_found", Unit.OPERATION, "Document operations not applied due to document not found"), + HTTPAPI_FAILED_UNKNOWN("httpapi_failed_unknown", Unit.OPERATION, "Document operations failed by unknown cause"), + HTTPAPI_FAILED_TIMEOUT("httpapi_failed_timeout", Unit.OPERATION, "Document operations failed by timeout"), + HTTPAPI_FAILED_INSUFFICIENT_STORAGE("httpapi_failed_insufficient_storage", Unit.OPERATION, "Document operations failed by insufficient storage"), + + MEM_HEAP_TOTAL("mem.heap.total", Unit.BYTE, "Total available heap memory"), + MEM_HEAP_FREE("mem.heap.free", Unit.BYTE, "Free heap memory"), + MEM_HEAP_USED("mem.heap.used", Unit.BYTE, "Currently used heap memory"), + MEM_DIRECT_TOTAL("mem.direct.total", Unit.BYTE, "Total available direct memory"), + MEM_DIRECT_FREE("mem.direct.free", Unit.BYTE, "Currently free direct memory"), + MEM_DIRECT_USED("mem.direct.used", Unit.BYTE, "Direct memory currently used"), + MEM_DIRECT_COUNT("mem.direct.count", Unit.BYTE, "Number of direct memory allocations"), + MEM_NATIVE_TOTAL("mem.native.total", Unit.BYTE, "Total available native memory"), + MEM_NATIVE_FREE("mem.native.free", Unit.BYTE, "Currently free native memory"), + MEM_NATIVE_USED("mem.native.used", Unit.BYTE, "Native memory currently used"), + + ATHENZ_TENANT_CERT_EXPIRY_SECONDS("athenz-tenant-cert.expiry.seconds", Unit.SECOND, "Time remaining until Athenz tenant certificate expires"), + CONTAINER_IAM_ROLE_EXPIRY_SECONDS("container-iam-role.expiry.seconds", Unit.SECOND, "Time remaining until IAM role expires"), + + + // SearchChain metrics + PEAK_QPS("peak_qps", Unit.QUERY_PER_SECOND, "The highest number of qps for a second for this metrics shapshot"), + SEARCH_CONNECTIONS("search_connections", Unit.CONNECTION, "Number of search connections"), + FEED_OPERATIONS("feed.operations", Unit.OPERATION, "Number of document feed operations"), + FEED_LATENCY("feed.latency", Unit.MILLISECOND, "Feed latency"), + FEED_HTTP_REQUESTS("feed.http-requests", Unit.OPERATION, "Feed HTTP requests"), + QUERIES("queries", Unit.OPERATION, "Query volume"), + QUERY_CONTAINER_LATENCY("query_container_latency", Unit.MILLISECOND, "The query execution time consumed in the container"), + QUERY_LATENCY("query_latency", Unit.MILLISECOND, "The overall query latency as seen by the container"), + QUERY_TIMEOUT("query_timeout", Unit.MILLISECOND, "The amount of time allowed for query execytion, from the client"), + FAILED_QUERIES("failed_queries", Unit.OPERATION, "The number of failed queries"), + DEGRADED_QUERIES("degraded_queries", Unit.OPERATION, "The number of degraded queries, e.g. due to some conent nodes not responding in time"), + HITS_PER_QUERY("hits_per_query", Unit.HIT_PER_QUERY, "The number of hits returned"), + QUERY_HIT_OFFSET("query_hit_offset", Unit.HIT, "The offset for hits returned"), + DOCUMENTS_COVERED("documents_covered", Unit.DOCUMENT, "The combined number of documents considered during query evaluation"), + DOCUMENTS_TOTAL("documents_total", Unit.DOCUMENT, "The number of documents to be evaluated if all requests had been fully executed"), + DOCUMENTS_TARGET_TOTAL("documents_target_total", Unit.DOCUMENT, "The target number of total documents to be evaluated when when all data is in sync"), + JDISC_RENDER_LATENCY("jdisc.render.latency", Unit.NANOSECOND, "The time used by the container to render responses"), + QUERY_ITEM_COUNT("query_item_count", Unit.ITEM, "The number of query items (terms, phrases, etc)"), + DOCPROC_PROC_TIME("docproc.proctime", Unit.MILLISECOND, "Time spent processing document"), + DOCPROC_DOCUMENTS("docproc.documents", Unit.DOCUMENT, "Number of processed documents"), + + TOTAL_HITS_PER_QUERY("totalhits_per_query", Unit.HIT_PER_QUERY, "The total number of documents found to match queries"), + EMPTY_RESULTS("empty_results", Unit.OPERATION, "Number of queries matching no documents"), + REQUESTS_OVER_QUOTA("requestsOverQuota", Unit.OPERATION, "The number of requests rejected due to exceeding quota"), + + RELEVANCE_AT_1("relevance.at_1", Unit.SCORE, "The relevance of hit number 1"), + RELEVANCE_AT_3("relevance.at_3", Unit.SCORE, "The relevance of hit number 3"), + RELEVANCE_AT_10("relevance.at_10", Unit.SCORE, "The relevance of hit number 10"), + + // Errors from search container + ERROR_TIMEOUT("error.timeout", Unit.OPERATION, "Requests that timed out"), + ERROR_BACKENDS_OOS("error.backends_oos", Unit.OPERATION, "Requests that failed due to no available backends nodes"), + ERROR_PLUGIN_FAILURE("error.plugin_failure", Unit.OPERATION, "Requests that failed due to plugin failure"), + ERROR_BACKEND_COMMUNICATION_ERROR("error.backend_communication_error", Unit.OPERATION, "Requests that failed due to backend communication error"), + ERROR_EMPTY_DOCUMENT_SUMMARIES("error.empty_document_summaries", Unit.OPERATION, "Requests that failed due to missing document summaries"), + ERROR_ILLEGAL_QUERY("error.illegal_query", Unit.OPERATION, "Requests that failed due to illegal queries"), + ERROR_INVALID_QUERY_PARAMETER("error.invalid_query_parameter", Unit.OPERATION, "Requests that failed due to invalid query parameters"), + ERROR_INTERNAL_SERVER_ERROR("error.internal_server_error", Unit.OPERATION, "Requests that failed due to internal server error"), + ERROR_MISCONFIGURED_SERVER("error.misconfigured_server", Unit.OPERATION, "Requests that failed due to misconfigured server"), + ERROR_INVALID_QUERY_TRANSFORMATION("error.invalid_query_transformation", Unit.OPERATION, "Requests that failed due to invalid query transformation"), + ERROR_RESULTS_WITH_ERRORS("error.results_with_errors", Unit.OPERATION, "The number of queries with error payload"), + ERROR_UNSPECIFIED("error.unspecified", Unit.OPERATION, "Requests that failed for an unspecified reason"), + ERROR_UNHANDLED_EXCEPTION("error.unhandled_exception", Unit.OPERATION, "Requests that failed due to an unhandled exception"), + + // Deprecated metrics. TODO: Remove in Vespa 9. + SERVER_REJECTED_REQUESTS("serverRejectedRequests", Unit.OPERATION, "Deprecated. Use jdisc.thread_pool.rejected_tasks instead."), + SERVER_THREAD_POOL_SIZE("serverThreadPoolSize", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.size instead."), + SERVER_ACTIVE_THREADS("serverActiveThreads", Unit.THREAD, "Deprecated. Use jdisc.thread_pool.active_threads instead."), + + // Java (JRT) TLS metrics + JRT_TRANSPORT_TLS_CERTIFICATE_VERIFICATION_FAILURES("jrt.transport.tls-certificate-verification-failures", Unit.FAILURE, "TLS certificate verification failures"), + JRT_TRANSPORT_PEER_AUTHORIZATION_FAILURES("jrt.transport.peer-authorization-failures", Unit.FAILURE, "TLS peer authorization failures"), + JRT_TRANSPORT_SERVER_TLS_CONNECIONTS_ESTABLISHED("jrt.transport.server.tls-connections-established", Unit.CONNECTION, "TLS server connections established"), + JRT_TRANSPORT_CLIENT_TLS_CONNECTIONS_ESTABLISHED("jrt.transport.client.tls-connections-established", Unit.CONNECTION, "TLS client connections established"), + JRT_TRANSPORT_SERVER_UNENCRYPTED_CONNECTIONS_ESTABLISHED("jrt.transport.server.unencrypted-connections-established", Unit.CONNECTION, "Unencrypted server connections established"), + JRT_TRANSPORT_CLIENT_UNENCRYPTED_CONNECTIONS_ESTABLISHED("jrt.transport.client.unencrypted-connections-established", Unit.CONNECTION, "Unencrypted client connections established"), + + MAX_QUERY_LATENCY("max_query_latency", Unit.MILLISECOND, "Deprecated. Use query_latency.max instead"), // TODO: Remove in Vespa 9 + MEAN_QUERY_LATENCY("mean_query_latency", Unit.MILLISECOND, "Deprecated. Use the expression (query_latency.sum / query_latency.count) instead"),// TODO: Remove in Vespa 9 + + + // Metrics defined in com/yahoo/jdisc/http/filter/security/athenz/AthenzAuthorizationFilter.java + JDISC_HTTP_FILTER_ATHENZ_ACCEPTED_REQUESTS("jdisc.http.filter.athenz.accepted_requests", Unit.REQUEST, "Number of requests accepted by the AthenzAuthorization filter"), + JDISC_HTTP_FILTER_ATHENZ_REJECTED_REQUESTS("jdisc.http.filter.athenz.rejected_requests", Unit.REQUEST, "Number of requests rejected by the AthenzAuthorization filter"), + + + // Metrics defined in com/yahoo/jdisc/http/server/jetty/MetricDefinitions.java + SERVER_CONNECTIONS_OPEN_MAX("serverConnectionsOpenMax", Unit.CONNECTION, "Maximum number of open connections"), + SERVER_CONNECTION_DURATION_MAX("serverConnectionDurationMax", Unit.MILLISECOND, "Longest duration a connection is kept open"), + + SERVER_CONNECTION_DURATION_MEAN("serverConnectionDurationMean", Unit.MILLISECOND, "Average duration a connection is kept open"), + SERVER_CONNECTION_DURATION_STD_DEV("serverConnectionDurationStdDev", Unit.MILLISECOND, "Standard deviation of open connection duration"), + SERVER_NUM_REQUESTS("serverNumRequests", Unit.REQUEST, "Number of requests"), + SERVER_NUM_SUCCESSFUL_RESPONSES("serverNumSuccessfulResponses", Unit.REQUEST, "Number of successful responses"), + SERVER_NUM_FAILED_RESPONSES("serverNumFailedResponses", Unit.REQUEST, "Number of failed responses"), + + SERVER_NUM_SUCCESSFUL_RESPONSE_WRITES("serverNumSuccessfulResponseWrites", Unit.REQUEST, "Number of successful response writes"), + SERVER_NUM_FAILED_RESPONSE_WRITES("serverNumFailedResponseWrites", Unit.REQUEST, "Number of failed response writes"), + + SERVER_TOTAL_SUCCESFUL_RESPONSE_LATENCY("serverTotalSuccessfulResponseLatency", Unit.MILLISECOND, "Total duration for execution of successful responses"), + SERVER_TOTAL_FAILED_RESPONSE_LATENCY("serverTotalFailedResponseLatency", Unit.MILLISECOND, "Total duration for execution of failed responses"), + SERVER_TIME_TO_FIRST_BYTE("serverTimeToFirstByte", Unit.MILLISECOND, "Time from request has been received by the server until the first byte is returned to the client"), + + SERVER_STARTED_MILLIS("serverStartedMillis", Unit.MILLISECOND, "Time since the service was started"); + + + + private final String name; + private final Unit unit; + private final String description; + + ContainerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java b/metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java new file mode 100644 index 00000000000..b2ebbc4d6aa --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/DistributorMetrics.java @@ -0,0 +1,257 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ +public enum DistributorMetrics implements VespaMetrics { + + VDS_IDEALSTATE_BUCKETS_RECHECKING("vds.idealstate.buckets_rechecking", Unit.BUCKET, "The number of buckets that we are rechecking for ideal state operations"), + VDS_IDEALSTATE_IDEALSTATE_DIFF("vds.idealstate.idealstate_diff", Unit.BUCKET, "A number representing the current difference from the ideal state. This is a number that decreases steadily as the system is getting closer to the ideal state"), + VDS_IDEALSTATE_BUCKETS_TOOFEWCOPIES("vds.idealstate.buckets_toofewcopies", Unit.BUCKET, "The number of buckets the distributor controls that have less than the desired redundancy"), + VDS_IDEALSTATE_BUCKETS_TOOMANYCOPIES("vds.idealstate.buckets_toomanycopies", Unit.BUCKET, "The number of buckets the distributor controls that have more than the desired redundancy"), + VDS_IDEALSTATE_BUCKETS("vds.idealstate.buckets", Unit.BUCKET, "The number of buckets the distributor controls"), + VDS_IDEALSTATE_BUCKETS_NOTRUSTED("vds.idealstate.buckets_notrusted", Unit.BUCKET, "The number of buckets that have no trusted copies."), + VDS_IDEALSTATE_BUCKET_REPLICAS_MOVING_OUT("vds.idealstate.bucket_replicas_moving_out", Unit.BUCKET, "Bucket replicas that should be moved out, e.g. retirement case or node added to cluster that has higher ideal state priority."), + VDS_IDEALSTATE_BUCKET_REPLICAS_COPYING_OUT("vds.idealstate.bucket_replicas_copying_out", Unit.BUCKET, "Bucket replicas that should be copied out, e.g. node is in ideal state but might have to provide data other nodes in a merge"), + VDS_IDEALSTATE_BUCKET_REPLICAS_COPYING_IN("vds.idealstate.bucket_replicas_copying_in", Unit.BUCKET, "Bucket replicas that should be copied in, e.g. node does not have a replica for a bucket that it is in ideal state for"), + VDS_IDEALSTATE_BUCKET_REPLICAS_SYNCING("vds.idealstate.bucket_replicas_syncing", Unit.BUCKET, "Bucket replicas that need syncing due to mismatching metadata"), + VDS_IDEALSTATE_MAX_OBSERVED_TIME_SINCE_LAST_GC_SEC("vds.idealstate.max_observed_time_since_last_gc_sec", Unit.SECOND, "Maximum time (in seconds) since GC was last successfully run for a bucket. Aggregated max value across all buckets on the distributor."), + VDS_IDEALSTATE_DELETE_BUCKET_DONE_OK("vds.idealstate.delete_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"), + VDS_IDEALSTATE_DELETE_BUCKET_DONE_FAILED("vds.idealstate.delete_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"), + VDS_IDEALSTATE_DELETE_BUCKET_PENDING("vds.idealstate.delete_bucket.pending", Unit.OPERATION, "The number of operations pending"), + VDS_IDEALSTATE_DELETE_BUCKET_BLOCKED("vds.idealstate.delete_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"), + VDS_IDEALSTATE_DELETE_BUCKET_THROTTLED("vds.idealstate.delete_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"), + VDS_IDEALSTATE_MERGE_BUCKET_DONE_OK("vds.idealstate.merge_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"), + VDS_IDEALSTATE_MERGE_BUCKET_DONE_FAILED("vds.idealstate.merge_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"), + VDS_IDEALSTATE_MERGE_BUCKET_PENDING("vds.idealstate.merge_bucket.pending", Unit.OPERATION, "The number of operations pending"), + VDS_IDEALSTATE_MERGE_BUCKET_BLOCKED("vds.idealstate.merge_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"), + VDS_IDEALSTATE_MERGE_BUCKET_THROTTLED("vds.idealstate.merge_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"), + VDS_IDEALSTATE_MERGE_BUCKET_SOURCE_ONLY_COPY_CHANGED("vds.idealstate.merge_bucket.source_only_copy_changed", Unit.OPERATION, "The number of merge operations where source-only copy changed"), + VDS_IDEALSTATE_MERGE_BUCKET_SOURCE_ONLY_COPY_DELETE_BLOCKED("vds.idealstate.merge_bucket.source_only_copy_delete_blocked", Unit.OPERATION, "The number of merge operations where delete of unchanged source-only copies was blocked"), + VDS_IDEALSTATE_MERGE_BUCKET_SOURCE_ONLY_COPY_DELETE_FAILED("vds.idealstate.merge_bucket.source_only_copy_delete_failed", Unit.OPERATION, "The number of merge operations where delete of unchanged source-only copies failed"), + VDS_IDEALSTATE_SPLIT_BUCKET_DONE_OK("vds.idealstate.split_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"), + VDS_IDEALSTATE_SPLIT_BUCKET_DONE_FAILED("vds.idealstate.split_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"), + VDS_IDEALSTATE_SPLIT_BUCKET_PENDING("vds.idealstate.split_bucket.pending", Unit.OPERATION, "The number of operations pending"), + VDS_IDEALSTATE_SPLIT_BUCKET_BLOCKED("vds.idealstate.split_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"), + VDS_IDEALSTATE_SPLIT_BUCKET_THROTTLED("vds.idealstate.split_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"), + VDS_IDEALSTATE_JOIN_BUCKET_DONE_OK("vds.idealstate.join_bucket.done_ok", Unit.OPERATION, "The number of operations successfully performed"), + VDS_IDEALSTATE_JOIN_BUCKET_DONE_FAILED("vds.idealstate.join_bucket.done_failed", Unit.OPERATION, "The number of operations that failed"), + VDS_IDEALSTATE_JOIN_BUCKET_PENDING("vds.idealstate.join_bucket.pending", Unit.OPERATION, "The number of operations pending"), + VDS_IDEALSTATE_JOIN_BUCKET_BLOCKED("vds.idealstate.join_bucket.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"), + VDS_IDEALSTATE_JOIN_BUCKET_THROTTLED("vds.idealstate.join_bucket.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"), + VDS_IDEALSTATE_GARBAGE_COLLECTION_DONE_OK("vds.idealstate.garbage_collection.done_ok", Unit.OPERATION, "The number of operations successfully performed"), + VDS_IDEALSTATE_GARBAGE_COLLECTION_DONE_FAILED("vds.idealstate.garbage_collection.done_failed", Unit.OPERATION, "The number of operations that failed"), + VDS_IDEALSTATE_GARBAGE_COLLECTION_PENDING("vds.idealstate.garbage_collection.pending", Unit.OPERATION, "The number of operations pending"), + VDS_IDEALSTATE_GARBAGE_COLLECTION_DOCUMENTS_REMOVED("vds.idealstate.garbage_collection.documents_removed", Unit.DOCUMENT, "Number of documents removed by GC operations"), + VDS_IDEALSTATE_GARBAGE_COLLECTION_BLOCKED("vds.idealstate.garbage_collection.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"), + VDS_IDEALSTATE_GARBAGE_COLLECTION_THROTTLED("vds.idealstate.garbage_collection.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"), + + VDS_DISTRIBUTOR_PUTS_LATENCY("vds.distributor.puts.latency", Unit.MILLISECOND, "The latency of put operations"), + VDS_DISTRIBUTOR_PUTS_OK("vds.distributor.puts.ok", Unit.OPERATION, "The number of successful put operations performed"), + VDS_DISTRIBUTOR_PUTS_FAILURES_TOTAL("vds.distributor.puts.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_DISTRIBUTOR_PUTS_FAILURES_NOTFOUND("vds.distributor.puts.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_PUTS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.puts.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_PUTS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.puts.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_PUTS_FAILURES_NOTCONNECTED("vds.distributor.puts.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_PUTS_FAILURES_NOTREADY("vds.distributor.puts.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_PUTS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.puts.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_PUTS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.puts.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_PUTS_FAILURES_STORAGEFAILURE("vds.distributor.puts.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_PUTS_FAILURES_TIMEOUT("vds.distributor.puts.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_PUTS_FAILURES_BUSY("vds.distributor.puts.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_PUTS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.puts.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_REMOVES_LATENCY("vds.distributor.removes.latency", Unit.MILLISECOND, "The latency of remove operations"), + VDS_DISTRIBUTOR_REMOVES_OK("vds.distributor.removes.ok", Unit.OPERATION, "The number of successful removes operations performed"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_TOTAL("vds.distributor.removes.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_NOTFOUND("vds.distributor.removes.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_TEST_AND_SET_FAILED("vds.distributor.removes.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.removes.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_BUSY("vds.distributor.removes.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_INCONSISTENT_BUCKET("vds.distributor.removes.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_NOTCONNECTED("vds.distributor.removes.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_NOTREADY("vds.distributor.removes.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.removes.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_STORAGEFAILURE("vds.distributor.removes.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_TIMEOUT("vds.distributor.removes.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_REMOVES_FAILURES_WRONGDISTRIBUTOR("vds.distributor.removes.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_UPDATES_LATENCY("vds.distributor.updates.latency", Unit.MILLISECOND, "The latency of update operations"), + VDS_DISTRIBUTOR_UPDATES_OK("vds.distributor.updates.ok", Unit.OPERATION, "The number of successful updates operations performed"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_TOTAL("vds.distributor.updates.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_NOTFOUND("vds.distributor.updates.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_TEST_AND_SET_FAILED("vds.distributor.updates.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.updates.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_UPDATES_DIVERGING_TIMESTAMP_UPDATES("vds.distributor.updates.diverging_timestamp_updates", Unit.OPERATION, "Number of updates that report they were performed against divergent version timestamps on different replicas"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_BUSY("vds.distributor.updates.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_INCONSISTENT_BUCKET("vds.distributor.updates.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_NOTCONNECTED("vds.distributor.updates.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_NOTREADY("vds.distributor.updates.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.updates.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_STORAGEFAILURE("vds.distributor.updates.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_TIMEOUT("vds.distributor.updates.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_UPDATES_FAILURES_WRONGDISTRIBUTOR("vds.distributor.updates.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_UPDATES_FAST_PATH_RESTARTS("vds.distributor.updates.fast_path_restarts", Unit.OPERATION, "Number of safe path (write repair) updates that were restarted as fast path updates because all replicas returned documents with the same timestamp in the initial read phase"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_OK("vds.distributor.removelocations.ok", Unit.OPERATION, "The number of successful removelocations operations performed"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_TOTAL("vds.distributor.removelocations.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_BUSY("vds.distributor.removelocations.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.removelocations.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.removelocations.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_NOTCONNECTED("vds.distributor.removelocations.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_NOTFOUND("vds.distributor.removelocations.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_NOTREADY("vds.distributor.removelocations.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.removelocations.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_STORAGEFAILURE("vds.distributor.removelocations.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.removelocations.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_TIMEOUT("vds.distributor.removelocations.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.removelocations.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_REMOVELOCATIONS_LATENCY("vds.distributor.removelocations.latency", Unit.MILLISECOND, "The average latency of removelocations operations"), + VDS_DISTRIBUTOR_GETS_LATENCY("vds.distributor.gets.latency", Unit.MILLISECOND, "The average latency of gets operations"), + VDS_DISTRIBUTOR_GETS_OK("vds.distributor.gets.ok", Unit.OPERATION, "The number of successful gets operations performed"), + VDS_DISTRIBUTOR_GETS_FAILURES_TOTAL("vds.distributor.gets.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_DISTRIBUTOR_GETS_FAILURES_NOTFOUND("vds.distributor.gets.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_GETS_FAILURES_BUSY("vds.distributor.gets.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_GETS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.gets.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_GETS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.gets.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_GETS_FAILURES_NOTCONNECTED("vds.distributor.gets.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_GETS_FAILURES_NOTREADY("vds.distributor.gets.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_GETS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.gets.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_GETS_FAILURES_STORAGEFAILURE("vds.distributor.gets.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_GETS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.gets.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_GETS_FAILURES_TIMEOUT("vds.distributor.gets.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_GETS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.gets.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_VISITOR_LATENCY("vds.distributor.visitor.latency", Unit.MILLISECOND, "The average latency of visitor operations"), + VDS_DISTRIBUTOR_VISITOR_OK("vds.distributor.visitor.ok", Unit.OPERATION, "The number of successful visitor operations performed"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_TOTAL("vds.distributor.visitor.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_NOTREADY("vds.distributor.visitor.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_NOTCONNECTED("vds.distributor.visitor.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_WRONGDISTRIBUTOR("vds.distributor.visitor.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.visitor.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_STORAGEFAILURE("vds.distributor.visitor.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_TIMEOUT("vds.distributor.visitor.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_BUSY("vds.distributor.visitor.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_INCONSISTENT_BUCKET("vds.distributor.visitor.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_NOTFOUND("vds.distributor.visitor.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_VISITOR_BYTES_PER_VISITOR("vds.distributor.visitor.bytes_per_visitor", Unit.OPERATION, "The number of bytes visited on content nodes as part of a single client visitor command"), + VDS_DISTRIBUTOR_VISITOR_DOCS_PER_VISITOR("vds.distributor.visitor.docs_per_visitor", Unit.OPERATION, "The number of documents visited on content nodes as part of a single client visitor command"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.visitor.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_VISITOR_FAILURES_TEST_AND_SET_FAILED("vds.distributor.visitor.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + + VDS_DISTRIBUTOR_DOCSSTORED("vds.distributor.docsstored", Unit.DOCUMENT, "Number of documents stored in all buckets controlled by this distributor"), + VDS_DISTRIBUTOR_BYTESSTORED("vds.distributor.bytesstored", Unit.BYTE, "Number of bytes stored in all buckets controlled by this distributor"), + + METRICMANAGER_PERIODICHOOKLATENCY("metricmanager.periodichooklatency", Unit.MILLISECOND, "Time in ms used to update a single periodic hook"), + METRICMANAGER_RESETLATENCY("metricmanager.resetlatency", Unit.MILLISECOND, "Time in ms used to reset all metrics."), + METRICMANAGER_SLEEPTIME("metricmanager.sleeptime", Unit.MILLISECOND, "Time in ms worker thread is sleeping"), + METRICMANAGER_SNAPSHOTHOOKLATENCY("metricmanager.snapshothooklatency", Unit.MILLISECOND, "Time in ms used to update a single snapshot hook"), + METRICMANAGER_SNAPSHOTLATENCY("metricmanager.snapshotlatency", Unit.MILLISECOND, "Time in ms used to take a snapshot"), + VDS_DISTRIBUTOR_ACTIVATE_CLUSTER_STATE_PROCESSING_TIME("vds.distributor.activate_cluster_state_processing_time", Unit.MILLISECOND, "Elapsed time where the distributor thread is blocked on merging pending bucket info into its bucket database upon activating a cluster state"), + VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_ALLOCATED_BYTES("vds.distributor.bucket_db.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_DEAD_BYTES("vds.distributor.bucket_db.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_ONHOLD_BYTES("vds.distributor.bucket_db.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + VDS_DISTRIBUTOR_BUCKET_DB_MEMORY_USAGE_USED_BYTES("vds.distributor.bucket_db.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_BUSY("vds.distributor.getbucketlists.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.getbucketlists.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.getbucketlists.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_NOTCONNECTED("vds.distributor.getbucketlists.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_NOTFOUND("vds.distributor.getbucketlists.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_NOTREADY("vds.distributor.getbucketlists.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.getbucketlists.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_STORAGEFAILURE("vds.distributor.getbucketlists.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.getbucketlists.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_TIMEOUT("vds.distributor.getbucketlists.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_TOTAL("vds.distributor.getbucketlists.failures.total", Unit.OPERATION, "Total number of failures"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.getbucketlists.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_LATENCY("vds.distributor.getbucketlists.latency", Unit.MILLISECOND, "The average latency of getbucketlists operations"), + VDS_DISTRIBUTOR_GETBUCKETLISTS_OK("vds.distributor.getbucketlists.ok", Unit.OPERATION, "The number of successful getbucketlists operations performed"), + VDS_DISTRIBUTOR_RECOVERYMODESCHEDULINGTIME("vds.distributor.recoverymodeschedulingtime", Unit.MILLISECOND, "Time spent scheduling operations in recovery mode after receiving new cluster state"), + VDS_DISTRIBUTOR_SET_CLUSTER_STATE_PROCESSING_TIME("vds.distributor.set_cluster_state_processing_time", Unit.MILLISECOND, "Elapsed time where the distributor thread is blocked on processing its bucket database upon receiving a new cluster state"), + VDS_DISTRIBUTOR_STATE_TRANSITION_TIME("vds.distributor.state_transition_time", Unit.MILLISECOND, "Time it takes to complete a cluster state transition. If a state transition is preempted before completing, its elapsed time is counted as part of the total time spent for the final, completed state transition"), + VDS_DISTRIBUTOR_STATS_FAILURES_BUSY("vds.distributor.stats.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_STATS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.stats.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_STATS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.stats.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_STATS_FAILURES_NOTCONNECTED("vds.distributor.stats.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_STATS_FAILURES_NOTFOUND("vds.distributor.stats.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_STATS_FAILURES_NOTREADY("vds.distributor.stats.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_STATS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.stats.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_STATS_FAILURES_STORAGEFAILURE("vds.distributor.stats.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_STATS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.stats.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_STATS_FAILURES_TIMEOUT("vds.distributor.stats.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_STATS_FAILURES_TOTAL("vds.distributor.stats.failures.total", Unit.OPERATION, "The total number of failures"), + VDS_DISTRIBUTOR_STATS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.stats.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_STATS_LATENCY("vds.distributor.stats.latency", Unit.MILLISECOND, "The average latency of stats operations"), + VDS_DISTRIBUTOR_STATS_OK("vds.distributor.stats.ok", Unit.OPERATION, "The number of successful stats operations performed"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_BUSY("vds.distributor.update_gets.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.update_gets.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.update_gets.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_NOTCONNECTED("vds.distributor.update_gets.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_NOTFOUND("vds.distributor.update_gets.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_NOTREADY("vds.distributor.update_gets.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.update_gets.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_STORAGEFAILURE("vds.distributor.update_gets.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.update_gets.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_TIMEOUT("vds.distributor.update_gets.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_TOTAL("vds.distributor.update_gets.failures.total", Unit.OPERATION, "The total number of failures"), + VDS_DISTRIBUTOR_UPDATE_GETS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.update_gets.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_UPDATE_GETS_LATENCY("vds.distributor.update_gets.latency", Unit.MILLISECOND, "The average latency of update_gets operations"), + VDS_DISTRIBUTOR_UPDATE_GETS_OK("vds.distributor.update_gets.ok", Unit.OPERATION, "The number of successful update_gets operations performed"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_BUSY("vds.distributor.update_metadata_gets.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.update_metadata_gets.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.update_metadata_gets.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_NOTCONNECTED("vds.distributor.update_metadata_gets.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_NOTFOUND("vds.distributor.update_metadata_gets.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_NOTREADY("vds.distributor.update_metadata_gets.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.update_metadata_gets.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_STORAGEFAILURE("vds.distributor.update_metadata_gets.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.update_metadata_gets.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_TIMEOUT("vds.distributor.update_metadata_gets.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_TOTAL("vds.distributor.update_metadata_gets.failures.total", Unit.OPERATION, "The total number of failures"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.update_metadata_gets.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_LATENCY("vds.distributor.update_metadata_gets.latency", Unit.MILLISECOND, "The average latency of update_metadata_gets operations"), + VDS_DISTRIBUTOR_UPDATE_METADATA_GETS_OK("vds.distributor.update_metadata_gets.ok", Unit.OPERATION, "The number of successful update_metadata_gets operations performed"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_BUSY("vds.distributor.update_puts.failures.busy", Unit.OPERATION, "The number of messages from storage that failed because the storage node was busy"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_CONCURRENT_MUTATIONS("vds.distributor.update_puts.failures.concurrent_mutations", Unit.OPERATION, "The number of operations that were transiently failed due to a mutating operation already being in progress for its document ID"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_INCONSISTENT_BUCKET("vds.distributor.update_puts.failures.inconsistent_bucket", Unit.OPERATION, "The number of operations failed due to buckets being in an inconsistent state or not found"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_NOTCONNECTED("vds.distributor.update_puts.failures.notconnected", Unit.OPERATION, "The number of operations discarded because there were no available storage nodes to send to"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_NOTFOUND("vds.distributor.update_puts.failures.notfound", Unit.OPERATION, "The number of operations that failed because the document did not exist"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_NOTREADY("vds.distributor.update_puts.failures.notready", Unit.OPERATION, "The number of operations discarded because distributor was not ready"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_SAFE_TIME_NOT_REACHED("vds.distributor.update_puts.failures.safe_time_not_reached", Unit.OPERATION, "The number of operations that were transiently failed due to them arriving before the safe time point for bucket ownership handovers has passed"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_STORAGEFAILURE("vds.distributor.update_puts.failures.storagefailure", Unit.OPERATION, "The number of operations that failed in storage"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_TEST_AND_SET_FAILED("vds.distributor.update_puts.failures.test_and_set_failed", Unit.OPERATION, "The number of mutating operations that failed because they specified a test-and-set condition that did not match the existing document"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_TIMEOUT("vds.distributor.update_puts.failures.timeout", Unit.OPERATION, "The number of operations that failed because the operation timed out towards storage"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_TOTAL("vds.distributor.update_puts.failures.total", Unit.OPERATION, "The total number of put failures"), + VDS_DISTRIBUTOR_UPDATE_PUTS_FAILURES_WRONGDISTRIBUTOR("vds.distributor.update_puts.failures.wrongdistributor", Unit.OPERATION, "The number of operations discarded because they were sent to the wrong distributor"), + VDS_DISTRIBUTOR_UPDATE_PUTS_LATENCY("vds.distributor.update_puts.latency", Unit.MILLISECOND, "The average latency of update_puts operations"), + VDS_DISTRIBUTOR_UPDATE_PUTS_OK("vds.distributor.update_puts.ok", Unit.OPERATION, "The number of successful update_puts operations performed"), + + VDS_IDEALSTATE_NODES_PER_MERGE("vds.idealstate.nodes_per_merge", Unit.NODE, "The number of nodes involved in a single merge operation."), + VDS_IDEALSTATE_SET_BUCKET_STATE_BLOCKED("vds.idealstate.set_bucket_state.blocked", Unit.OPERATION, "The number of operations blocked by blocking operation starter"), + VDS_IDEALSTATE_SET_BUCKET_STATE_DONE_FAILED("vds.idealstate.set_bucket_state.done_failed", Unit.OPERATION, "The number of operations that failed"), + VDS_IDEALSTATE_SET_BUCKET_STATE_DONE_OK("vds.idealstate.set_bucket_state.done_ok", Unit.OPERATION, "The number of operations successfully performed"), + VDS_IDEALSTATE_SET_BUCKET_STATE_PENDING("vds.idealstate.set_bucket_state.pending", Unit.OPERATION, "The number of operations pending"), + VDS_IDEALSTATE_SET_BUCKET_STATE_THROTTLED("vds.idealstate.set_bucket_state.throttled", Unit.OPERATION, "The number of operations throttled by throttling operation starter"), + + VDS_BOUNCER_CLOCK_SKEW_ABORTS("vds.bouncer.clock_skew_aborts", Unit.OPERATION, "Number of client operations that were aborted due to clock skew between sender and receiver exceeding acceptable range"); + + + private final String name; + private final Unit unit; + private final String description; + + DistributorMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java b/metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java new file mode 100644 index 00000000000..927672a43f7 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/HostedNodeAdminMetrics.java @@ -0,0 +1,61 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ + +// TODO: Move to hosted repo. +public enum HostedNodeAdminMetrics implements VespaMetrics { + + // System metrics + CPU_UTIL("cpu.util", Unit.PERCENTAGE, "CPU utilisation"), + CPU_SYS_UTIL("cpu.sys.util", Unit.PERCENTAGE, "System CPU utilisation"), + CPU_THROTTLED_TIME("cpu.throttled_time.rate", Unit.PERCENTAGE, "Part of the time CPU is exhausted (CPU throttling enforced)"), + CPU_THROTTLED_CPU_TIME("cpu.throttled_cpu_time.rate", Unit.PERCENTAGE, "Part of the time CPU is exhausted (CPU throttling enforced)"), + CPU_VCPUS("cpu.vcpus", Unit.ITEM, "Number of virtual CPU threads allocation to the node"), + DISK_LIMIT("disk.limit", Unit.BYTE, "Amount of disk space available on the node"), + DISK_USED("disk.used", Unit.BYTE, "Amount of disk space used by the node"), + DISK_UTIL("disk.util", Unit.PERCENTAGE, "Disk space utilisation"), + MEM_LIMIT("mem.limit", Unit.BYTE, "Amount of memory available on the node"), + MEM_USED("mem.used", Unit.BYTE, "Amount of memory used by the node"), + MEM_UTIL("mem.util", Unit.PERCENTAGE, "Memory utilisation"), + MEM_TOTAL_USED("mem_total.used", Unit.BYTE, "Total amount of memory used by the node, including OS buffer caches"), + MEM_TOTAL_UTIL("mem_total.util", Unit.PERCENTAGE, "Total memory utilisation"), + GPU_UTIL("gpu.util", Unit.PERCENTAGE, "GPU utilisation"), + GPU_MEM_USED("gpu.memory.used", Unit.BYTE, "GPU memory used"), + GPU_MEM_TOTAL("gpu.memory.total", Unit.BYTE, "GPU memory available"), + + + // Network metrics + NET_IN_BYTES("net.in.bytes", Unit.BYTE, "Network bytes received (rxBytes) (COUNT metric)"), + NET_IN_ERROR("net.in.errors", Unit.FAILURE, "Network receive errors (rxErrors)"), + NET_IN_DROPPED("net.in.dropped", Unit.PACKET, "Inbound network packets dropped (rxDropped)"), + NET_OUT_BYTES("net.in.bytes", Unit.BYTE, "Network bytes sent (txBytes) (COUNT metric)"), + NET_OUT_ERROR("net.in.errors", Unit.FAILURE, "Network send errors (txErrors)"), + NET_OUT_DROPPED("net.in.dropped", Unit.PACKET, "Outbound network packets dropped (txDropped)"), + BANDWIDTH_LIMIT("bandwidth.limit", Unit.BYTE_PER_SECOND, "Available network bandwidth"); + + private final String name; + private final Unit unit; + private final String description; + + HostedNodeAdminMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} + diff --git a/metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java b/metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java new file mode 100644 index 00000000000..79122e3b922 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/LogdMetrics.java @@ -0,0 +1,33 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ +public enum LogdMetrics implements VespaMetrics { + + LOGD_PROCESSED_LINES("logd.processed.lines", Unit.ITEM, "Number of log lines processed"); + + private final String name; + private final Unit unit; + private final String description; + + LogdMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} + diff --git a/metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java b/metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java new file mode 100644 index 00000000000..74da68dbcb7 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/NodeAdminMetrics.java @@ -0,0 +1,35 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ +public enum NodeAdminMetrics implements VespaMetrics { + + ENDPOINT_CERTIFICATE_EXPIRY_SECONDS("endpoint.certificate.expiry.seconds", Unit.SECOND, "Time until node endpoint certificate expires"), + NODE_CERTIFICATE_EXPIRY_SECONDS("node-certificate.expiry.seconds", Unit.SECOND, "Time until node certificate expires"); + + + private final String name; + private final Unit unit; + private final String description; + + NodeAdminMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} + diff --git a/metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java new file mode 100644 index 00000000000..cf35cdae90e --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/RoutingLayerMetrics.java @@ -0,0 +1,34 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ + +// Internal hosted Vespa only TODO: Move to a better place +public enum RoutingLayerMetrics implements VespaMetrics { + + WORKER_CONNECTIONS("worker.connections", Unit.CONNECTION, "Yahoo! Internal: Number of connections for the routing worker having most connections per node"); + + private final String name; + private final Unit unit; + private final String description; + + RoutingLayerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java b/metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java new file mode 100644 index 00000000000..d6018dc0633 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/SearchNodeMetrics.java @@ -0,0 +1,299 @@ +package ai.vespa.metrics; + +/** + * @author gjoranv + */ +public enum SearchNodeMetrics implements VespaMetrics { + + CONTENT_PROTON_CONFIG_GENERATION("content.proton.config.generation", Unit.VERSION, "The oldest config generation used by this search node"), + + CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_TOTAL("content.proton.documentdb.documents.total", Unit.DOCUMENT, "The total number of documents in this documents db (ready + not-ready)"), + CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_READY("content.proton.documentdb.documents.ready", Unit.DOCUMENT, "The number of ready documents in this document db"), + CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_ACTIVE("content.proton.documentdb.documents.active", Unit.DOCUMENT, "The number of active / searchable documents in this document db"), + CONTENT_PROTON_DOCUMENTDB_DOCUMENTS_REMOVED("content.proton.documentdb.documents.removed", Unit.DOCUMENT, "The number of removed documents in this document db"), + + CONTENT_PROTON_DOCUMENTDB_INDEX_DOCS_IN_MEMORY("content.proton.documentdb.index.docs_in_memory", Unit.DOCUMENT, "Number of documents in memory index"), + CONTENT_PROTON_DOCUMENTDB_DISK_USAGE("content.proton.documentdb.disk_usage", Unit.BYTE, "The total disk usage (in bytes) for this document db"), + CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + CONTENT_PROTON_DOCUMENTDB_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_HEART_BEAT_AGE("content.proton.documentdb.heart_beat_age", Unit.SECOND, "How long ago (in seconds) heart beat maintenace job was run"), + CONTENT_PROTON_DOCSUM_COUNT("content.proton.docsum.count", Unit.REQUEST, "Docsum requests handled"), + CONTENT_PROTON_DOCSUM_DOCS("content.proton.docsum.docs", Unit.DOCUMENT, "Total docsums returned"), + CONTENT_PROTON_DOCSUM_LATENCY("content.proton.docsum.latency", Unit.MILLISECOND, "Docsum request latency"), + + // Search protocol + CONTENT_PROTON_SEARCH_PROTOCOL_QUERY_LATENCY("content.proton.search_protocol.query.latency", Unit.SECOND, "Query request latency (seconds)"), + CONTENT_PROTON_SEARCH_PROTOCOL_QUERY_REQUEST_SIZE("content.proton.search_protocol.query.request_size", Unit.BYTE, "Query request size (network bytes)"), + CONTENT_PROTON_SEARCH_PROTOCOL_QUERY_REPLY_SIZE("content.proton.search_protocol.query.reply_size", Unit.BYTE, "Query reply size (network bytes)"), + CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_LATENCY("content.proton.search_protocol.docsum.latency", Unit.SECOND, "Docsum request latency (seconds)"), + CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REQUEST_SIZE("content.proton.search_protocol.docsum.request_size", Unit.BYTE, "Docsum request size (network bytes)"), + CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REPLY_SIZE("content.proton.search_protocol.docsum.reply_size", Unit.BYTE, "Docsum reply size (network bytes)"), + CONTENT_PROTON_SEARCH_PROTOCOL_DOCSUM_REQUESTED_DOCUMENTS("content.proton.search_protocol.docsum.requested_documents", Unit.DOCUMENT, "Total requested document summaries"), + + // Executors shared between all document dbs + CONTENT_PROTON_EXECUTOR_PROTON_QUEUESIZE("content.proton.executor.proton.queuesize", Unit.TASK, "Size of executor proton task queue"), + CONTENT_PROTON_EXECUTOR_PROTON_ACCEPTED("content.proton.executor.proton.accepted", Unit.TASK, "Number of executor proton accepted tasks"), + CONTENT_PROTON_EXECUTOR_PROTON_WAKEUPS("content.proton.executor.proton.wakeups", Unit.WAKEUP, "Number of times a executor proton worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_PROTON_UTILIZATION("content.proton.executor.proton.utilization", Unit.FRACTION, "Ratio of time the executor proton worker threads has been active"), + CONTENT_PROTON_EXECUTOR_PROTON_REJECTED("content.proton.executor.proton.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_EXECUTOR_FLUSH_QUEUESIZE("content.proton.executor.flush.queuesize", Unit.TASK, "Size of executor flush task queue"), + CONTENT_PROTON_EXECUTOR_FLUSH_ACCEPTED("content.proton.executor.flush.accepted", Unit.TASK, "Number of accepted executor flush tasks"), + CONTENT_PROTON_EXECUTOR_FLUSH_WAKEUPS("content.proton.executor.flush.wakeups", Unit.WAKEUP, "Number of times a executor flush worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_FLUSH_UTILIZATION("content.proton.executor.flush.utilization", Unit.FRACTION, "Ratio of time the executor flush worker threads has been active"), + CONTENT_PROTON_EXECUTOR_FLUSH_REJECTED("content.proton.executor.flush.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_EXECUTOR_MATCH_QUEUESIZE("content.proton.executor.match.queuesize", Unit.TASK, "Size of executor match task queue"), + CONTENT_PROTON_EXECUTOR_MATCH_ACCEPTED("content.proton.executor.match.accepted", Unit.TASK, "Number of accepted executor match tasks"), + CONTENT_PROTON_EXECUTOR_MATCH_WAKEUPS("content.proton.executor.match.wakeups", Unit.WAKEUP, "Number of times a executor match worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_MATCH_UTILIZATION("content.proton.executor.match.utilization", Unit.FRACTION, "Ratio of time the executor match worker threads has been active"), + CONTENT_PROTON_EXECUTOR_MATCH_REJECTED("content.proton.executor.match.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_EXECUTOR_DOCSUM_QUEUESIZE("content.proton.executor.docsum.queuesize", Unit.TASK, "Size of executor docsum task queue"), + CONTENT_PROTON_EXECUTOR_DOCSUM_ACCEPTED("content.proton.executor.docsum.accepted", Unit.TASK, "Number of executor accepted docsum tasks"), + CONTENT_PROTON_EXECUTOR_DOCSUM_WAKEUPS("content.proton.executor.docsum.wakeups", Unit.WAKEUP, "Number of times a executor docsum worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_DOCSUM_UTILIZATION("content.proton.executor.docsum.utilization", Unit.FRACTION, "Ratio of time the executor docsum worker threads has been active"), + CONTENT_PROTON_EXECUTOR_DOCSUM_REJECTED("content.proton.executor.docsum.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_EXECUTOR_SHARED_QUEUESIZE("content.proton.executor.shared.queuesize", Unit.TASK, "Size of executor shared task queue"), + CONTENT_PROTON_EXECUTOR_SHARED_ACCEPTED("content.proton.executor.shared.accepted", Unit.TASK, "Number of executor shared accepted tasks"), + CONTENT_PROTON_EXECUTOR_SHARED_WAKEUPS("content.proton.executor.shared.wakeups", Unit.WAKEUP, "Number of times a executor shared worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_SHARED_UTILIZATION("content.proton.executor.shared.utilization", Unit.FRACTION, "Ratio of time the executor shared worker threads has been active"), + CONTENT_PROTON_EXECUTOR_SHARED_REJECTED("content.proton.executor.shared.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_EXECUTOR_WARMUP_QUEUESIZE("content.proton.executor.warmup.queuesize", Unit.TASK, "Size of executor warmup task queue"), + CONTENT_PROTON_EXECUTOR_WARMUP_ACCEPTED("content.proton.executor.warmup.accepted", Unit.TASK, "Number of accepted executor warmup tasks"), + CONTENT_PROTON_EXECUTOR_WARMUP_WAKEUPS("content.proton.executor.warmup.wakeups", Unit.WAKEUP, "Number of times a warmup executor worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_WARMUP_UTILIZATION("content.proton.executor.warmup.utilization", Unit.FRACTION, "Ratio of time the executor warmup worker threads has been active"), + CONTENT_PROTON_EXECUTOR_WARMUP_REJECTED("content.proton.executor.warmup.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_EXECUTOR_FIELD_WRITER_QUEUESIZE("content.proton.executor.field_writer.queuesize", Unit.TASK, "Size of executor field writer task queue"), + CONTENT_PROTON_EXECUTOR_FIELD_WRITER_ACCEPTED("content.proton.executor.field_writer.accepted", Unit.TASK, "Number of accepted executor field writer tasks"), + CONTENT_PROTON_EXECUTOR_FIELD_WRITER_WAKEUPS("content.proton.executor.field_writer.wakeups", Unit.WAKEUP, "Number of times a executor field writer worker thread has been woken up"), + CONTENT_PROTON_EXECUTOR_FIELD_WRITER_UTILIZATION("content.proton.executor.field_writer.utilization", Unit.FRACTION, "Ratio of time the executor fieldwriter worker threads has been active"), + CONTENT_PROTON_EXECUTOR_FIELD_WRITER_REJECTED("content.proton.executor.field_writer.rejected", Unit.TASK, "Number of rejected tasks"), + + // jobs + CONTENT_PROTON_DOCUMENTDB_JOB_TOTAL("content.proton.documentdb.job.total", Unit.FRACTION, "The job load average total of all job metrics"), + CONTENT_PROTON_DOCUMENTDB_JOB_ATTRIBUTE_FLUSH("content.proton.documentdb.job.attribute_flush", Unit.FRACTION, "Flushing of attribute vector(s) to disk"), + CONTENT_PROTON_DOCUMENTDB_JOB_MEMORY_INDEX_FLUSH("content.proton.documentdb.job.memory_index_flush", Unit.FRACTION, "Flushing of memory index to disk"), + CONTENT_PROTON_DOCUMENTDB_JOB_DISK_INDEX_FUSION("content.proton.documentdb.job.disk_index_fusion", Unit.FRACTION, "Fusion of disk indexes"), + CONTENT_PROTON_DOCUMENTDB_JOB_DOCUMENT_STORE_FLUSH("content.proton.documentdb.job.document_store_flush", Unit.FRACTION, "Flushing of document store to disk"), + CONTENT_PROTON_DOCUMENTDB_JOB_DOCUMENT_STORE_COMPACT("content.proton.documentdb.job.document_store_compact", Unit.FRACTION, "Compaction of document store on disk"), + CONTENT_PROTON_DOCUMENTDB_JOB_BUCKET_MOVE("content.proton.documentdb.job.bucket_move", Unit.FRACTION, "Moving of buckets between 'ready' and 'notready' sub databases"), + CONTENT_PROTON_DOCUMENTDB_JOB_LID_SPACE_COMPACT("content.proton.documentdb.job.lid_space_compact", Unit.FRACTION, "Compaction of lid space in document meta store and attribute vectors"), + CONTENT_PROTON_DOCUMENTDB_JOB_REMOVED_DOCUMENTS_PRUNE("content.proton.documentdb.job.removed_documents_prune", Unit.FRACTION, "Pruning of removed documents in 'removed' sub database"), + + // Threading service (per document db) + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_QUEUESIZE("content.proton.documentdb.threading_service.master.queuesize", Unit.TASK, "Size of threading service master task queue"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_ACCEPTED("content.proton.documentdb.threading_service.master.accepted", Unit.TASK, "Number of accepted threading service master tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_WAKEUPS("content.proton.documentdb.threading_service.master.wakeups", Unit.WAKEUP, "Number of times a threading service master worker thread has been woken up"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_UTILIZATION("content.proton.documentdb.threading_service.master.utilization", Unit.FRACTION, "Ratio of time the threading service master worker threads has been active"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_MASTER_REJECTED("content.proton.documentdb.threading_service.master.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_QUEUESIZE("content.proton.documentdb.threading_service.index.queuesize", Unit.TASK, "Size of threading service index task queue"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_ACCEPTED("content.proton.documentdb.threading_service.index.accepted", Unit.TASK, "Number of accepted threading service index tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_WAKEUPS("content.proton.documentdb.threading_service.index.wakeups", Unit.WAKEUP, "Number of times a threading service index worker thread has been woken up"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_UTILIZATION("content.proton.documentdb.threading_service.index.utilization", Unit.FRACTION, "Ratio of time the threading service index worker threads has been active"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_REJECTED("content.proton.documentdb.threading_service.index.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_QUEUESIZE("content.proton.documentdb.threading_service.summary.queuesize", Unit.TASK, "Size of threading service summary task queue"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_ACCEPTED("content.proton.documentdb.threading_service.summary.accepted", Unit.TASK, "Number of accepted threading service summary tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_WAKEUPS("content.proton.documentdb.threading_service.summary.wakeups", Unit.WAKEUP, "Number of times a threading service summary worker thread has been woken up"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_UTILIZATION("content.proton.documentdb.threading_service.summary.utilization", Unit.FRACTION, "Ratio of time the threading service summary worker threads has been active"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_SUMMARY_REJECTED("content.proton.documentdb.threading_service.summary.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_ACCEPTED("content.proton.documentdb.threading_service.attribute_field_writer.accepted", Unit.TASK, "Number of accepted tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_QUEUESIZE("content.proton.documentdb.threading_service.attribute_field_writer.queuesize", Unit.TASK, "Size of task queue"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_REJECTED("content.proton.documentdb.threading_service.attribute_field_writer.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_UTILIZATION("content.proton.documentdb.threading_service.attribute_field_writer.utilization", Unit.FRACTION, "Ratio of time the worker threads has been active"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_ATTRIBUTE_FIELD_WRITER_WAKEUPS("content.proton.documentdb.threading_service.attribute_field_writer.wakeups", Unit.WAKEUP, "Number of times a worker thread has been woken up"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_ACCEPTED("content.proton.documentdb.threading_service.index_field_inverter.accepted", Unit.TASK, "Number of accepted tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_QUEUESIZE("content.proton.documentdb.threading_service.index_field_inverter.queuesize", Unit.TASK, "Size of task queue"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_REJECTED("content.proton.documentdb.threading_service.index_field_inverter.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_UTILIZATION("content.proton.documentdb.threading_service.index_field_inverter.utilization", Unit.FRACTION, "Ratio of time the worker threads has been active"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_INVERTER_WAKEUPS("content.proton.documentdb.threading_service.index_field_inverter.wakeups", Unit.WAKEUP, "Number of times a worker thread has been woken up"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_ACCEPTED("content.proton.documentdb.threading_service.index_field_writer.accepted", Unit.TASK, "Number of accepted tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_QUEUESIZE("content.proton.documentdb.threading_service.index_field_writer.queuesize", Unit.TASK, "Size of task queue"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_REJECTED("content.proton.documentdb.threading_service.index_field_writer.rejected", Unit.TASK, "Number of rejected tasks"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_UTILIZATION("content.proton.documentdb.threading_service.index_field_writer.utilization", Unit.FRACTION, "Ratio of time the worker threads has been active"), + CONTENT_PROTON_DOCUMENTDB_THREADING_SERVICE_INDEX_FIELD_WRITER_WAKEUPS("content.proton.documentdb.threading_service.index_field_writer.wakeups", Unit.WAKEUP, "Number of times a worker thread has been woken up"), + + // lid space + CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LID_BLOAT_FACTOR("content.proton.documentdb.ready.lid_space.lid_bloat_factor", Unit.FRACTION, "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space ((lid_limit - used_lids) / lid_limit)"), + CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LID_FRAGMENTATION_FACTOR("content.proton.documentdb.ready.lid_space.lid_fragmentation_factor", Unit.FRACTION, "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space ((highest_used_lid - used_lids) / highest_used_lid)"), + CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LID_LIMIT("content.proton.documentdb.ready.lid_space.lid_limit", Unit.DOCUMENTID, "The size of the allocated lid space"), + CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_HIGHEST_USED_LID("content.proton.documentdb.ready.lid_space.highest_used_lid", Unit.DOCUMENTID, "The highest used lid"), + CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_USED_LIDS("content.proton.documentdb.ready.lid_space.used_lids", Unit.DOCUMENTID, "The number of lids used"), + CONTENT_PROTON_DOCUMENTDB_READY_LID_SPACE_LOWEST_FREE_LID("content.proton.documentdb.ready.lid_space.lowest_free_lid", Unit.DOCUMENTID, "The lowest free local document id"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LID_BLOAT_FACTOR("content.proton.documentdb.notready.lid_space.lid_bloat_factor", Unit.FRACTION, "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space ((lid_limit - used_lids) / lid_limit)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LID_FRAGMENTATION_FACTOR("content.proton.documentdb.notready.lid_space.lid_fragmentation_factor", Unit.FRACTION, "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space ((highest_used_lid - used_lids) / highest_used_lid)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LID_LIMIT("content.proton.documentdb.notready.lid_space.lid_limit", Unit.DOCUMENTID, "The size of the allocated lid space"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_HIGHEST_USED_LID("content.proton.documentdb.notready.lid_space.highest_used_lid", Unit.DOCUMENTID, "The highest used lid"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_USED_LIDS("content.proton.documentdb.notready.lid_space.used_lids", Unit.DOCUMENTID, "The number of lids used"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_LID_SPACE_LOWEST_FREE_LID("content.proton.documentdb.notready.lid_space.lowest_free_lid", Unit.DOCUMENTID, "The lowest free local document id"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LID_BLOAT_FACTOR("content.proton.documentdb.removed.lid_space.lid_bloat_factor", Unit.FRACTION, "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space ((lid_limit - used_lids) / lid_limit)"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LID_FRAGMENTATION_FACTOR("content.proton.documentdb.removed.lid_space.lid_fragmentation_factor", Unit.FRACTION, "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space ((highest_used_lid - used_lids) / highest_used_lid)"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LID_LIMIT("content.proton.documentdb.removed.lid_space.lid_limit", Unit.DOCUMENTID, "The size of the allocated lid space"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_HIGHEST_USED_LID("content.proton.documentdb.removed.lid_space.highest_used_lid", Unit.DOCUMENTID, "The highest used lid"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_USED_LIDS("content.proton.documentdb.removed.lid_space.used_lids", Unit.DOCUMENTID, "The number of lids used"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_LID_SPACE_LOWEST_FREE_LID("content.proton.documentdb.removed.lid_space.lowest_free_lid", Unit.DOCUMENTID, "The lowest free local document id"), + + // bucket move + CONTENT_PROTON_DOCUMENTDB_BUCKET_MOVE_BUCKETS_PENDING("content.proton.documentdb.bucket_move.buckets_pending", Unit.BUCKET, "The number of buckets left to move"), + + // resource usage + CONTENT_PROTON_RESOURCE_USAGE_DISK("content.proton.resource_usage.disk", Unit.FRACTION, "The relative amount of disk used by this content node (transient usage not included, value in the range [0, 1]). Same value as reported to the cluster controller"), + CONTENT_PROTON_RESOURCE_USAGE_DISK_USAGE_TOTAL("content.proton.resource_usage.disk_usage.total", Unit.FRACTION, "The total relative amount of disk used by this content node (value in the range [0, 1])"), + CONTENT_PROTON_RESOURCE_USAGE_DISK_USAGE_TOTAL_UTILIZATION("content.proton.resource_usage.disk_usage.total_utilization", Unit.FRACTION, "The relative amount of disk used compared to the content node disk resource limit"), + CONTENT_PROTON_RESOURCE_USAGE_DISK_USAGE_TRANSIENT("content.proton.resource_usage.disk_usage.transient", Unit.FRACTION, "The relative amount of transient disk used by this content node (value in the range [0, 1])"), + CONTENT_PROTON_RESOURCE_USAGE_MEMORY("content.proton.resource_usage.memory", Unit.FRACTION, "The relative amount of memory used by this content node (transient usage not included, value in the range [0, 1]). Same value as reported to the cluster controller"), + CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TOTAL("content.proton.resource_usage.memory_usage.total", Unit.FRACTION, "The total relative amount of memory used by this content node (value in the range [0, 1])"), + CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TOTAL_UTILIZATION("content.proton.resource_usage.memory_usage.total_utilization", Unit.FRACTION, "The relative amount of memory used compared to the content node memory resource limit"), + CONTENT_PROTON_RESOURCE_USAGE_MEMORY_USAGE_TRANSIENT("content.proton.resource_usage.memory_usage.transient", Unit.FRACTION, "The relative amount of transient memory used by this content node (value in the range [0, 1])"), + CONTENT_PROTON_RESOURCE_USAGE_MEMORY_MAPPINGS("content.proton.resource_usage.memory_mappings", Unit.FILE, "The number of memory mapped files"), + CONTENT_PROTON_RESOURCE_USAGE_OPEN_FILE_DESCRIPTORS("content.proton.resource_usage.open_file_descriptors", Unit.FILE, "The number of open files"), + CONTENT_PROTON_RESOURCE_USAGE_FEEDING_BLOCKED("content.proton.resource_usage.feeding_blocked", Unit.BINARY, "Whether feeding is blocked due to resource limits being reached (value is either 0 or 1)"), + CONTENT_PROTON_RESOURCE_USAGE_MALLOC_ARENA("content.proton.resource_usage.malloc_arena", Unit.BYTE, "Size of malloc arena"), + CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_ADDRESS_SPACE("content.proton.documentdb.attribute.resource_usage.address_space", Unit.FRACTION, "The max relative address space used among components in all attribute vectors in this document db (value in the range [0, 1])"), + CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_RESOURCE_USAGE_FEEDING_BLOCKED("content.proton.documentdb.attribute.resource_usage.feeding_blocked", Unit.BINARY, "Whether feeding is blocked due to attribute resource limits being reached (value is either 0 or 1)"), + CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.attribute.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.attribute.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.attribute.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + CONTENT_PROTON_DOCUMENTDB_ATTRIBUTE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.attribute.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + + // CPU util + CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_SETUP("content.proton.resource_usage.cpu_util.setup", Unit.FRACTION, "cpu used by system init and (re-)configuration"), + CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_READ("content.proton.resource_usage.cpu_util.read", Unit.FRACTION, "cpu used by reading data from the system"), + CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_WRITE("content.proton.resource_usage.cpu_util.write", Unit.FRACTION, "cpu used by writing data to the system"), + CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_COMPACT("content.proton.resource_usage.cpu_util.compact", Unit.FRACTION, "cpu used by internal data re-structuring"), + CONTENT_PROTON_RESOURCE_USAGE_CPU_UTIL_OTHER("content.proton.resource_usage.cpu_util.other", Unit.FRACTION, "cpu used by work not classified as a specific category"), + + // transaction log + CONTENT_PROTON_TRANSACTIONLOG_ENTRIES("content.proton.transactionlog.entries", Unit.RECORD, "The current number of entries in the transaction log"), + CONTENT_PROTON_TRANSACTIONLOG_DISK_USAGE("content.proton.transactionlog.disk_usage", Unit.BYTE, "The disk usage (in bytes) of the transaction log"), + CONTENT_PROTON_TRANSACTIONLOG_REPLAY_TIME("content.proton.transactionlog.replay_time", Unit.SECOND, "The replay time (in seconds) of the transaction log during start-up"), + + // document store + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_DISK_USAGE("content.proton.documentdb.ready.document_store.disk_usage", Unit.BYTE, "Disk space usage in bytes"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_DISK_BLOAT("content.proton.documentdb.ready.document_store.disk_bloat", Unit.BYTE, "Disk space bloat in bytes"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MAX_BUCKET_SPREAD("content.proton.documentdb.ready.document_store.max_bucket_spread", Unit.FRACTION, "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.ready.document_store.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.ready.document_store.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.ready.document_store.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.ready.document_store.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_DISK_USAGE("content.proton.documentdb.notready.document_store.disk_usage", Unit.BYTE, "Disk space usage in bytes"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_DISK_BLOAT("content.proton.documentdb.notready.document_store.disk_bloat", Unit.BYTE, "Disk space bloat in bytes"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MAX_BUCKET_SPREAD("content.proton.documentdb.notready.document_store.max_bucket_spread", Unit.FRACTION, "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.notready.document_store.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.notready.document_store.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.notready.document_store.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.notready.document_store.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_DISK_USAGE("content.proton.documentdb.removed.document_store.disk_usage", Unit.BYTE, "Disk space usage in bytes"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_DISK_BLOAT("content.proton.documentdb.removed.document_store.disk_bloat", Unit.BYTE, "Disk space bloat in bytes"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MAX_BUCKET_SPREAD("content.proton.documentdb.removed.document_store.max_bucket_spread", Unit.FRACTION, "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.removed.document_store.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.removed.document_store.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.removed.document_store.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.removed.document_store.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + + // document store cache + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_ELEMENTS("content.proton.documentdb.ready.document_store.cache.elements", Unit.ITEM, "Number of elements in the cache"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_MEMORY_USAGE("content.proton.documentdb.ready.document_store.cache.memory_usage", Unit.BYTE, "Memory usage of the cache (in bytes)"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_HIT_RATE("content.proton.documentdb.ready.document_store.cache.hit_rate", Unit.FRACTION, "Rate of hits in the cache compared to number of lookups"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_LOOKUPS("content.proton.documentdb.ready.document_store.cache.lookups", Unit.OPERATION, "Number of lookups in the cache (hits + misses)"), + CONTENT_PROTON_DOCUMENTDB_READY_DOCUMENT_STORE_CACHE_INVALIDATIONS("content.proton.documentdb.ready.document_store.cache.invalidations", Unit.OPERATION, "Number of invalidations (erased elements) in the cache. "), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_ELEMENTS("content.proton.documentdb.notready.document_store.cache.elements", Unit.ITEM, "Number of elements in the cache"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_MEMORY_USAGE("content.proton.documentdb.notready.document_store.cache.memory_usage", Unit.BYTE, "Memory usage of the cache (in bytes)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_HIT_RATE("content.proton.documentdb.notready.document_store.cache.hit_rate", Unit.FRACTION, "Rate of hits in the cache compared to number of lookups"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_LOOKUPS("content.proton.documentdb.notready.document_store.cache.lookups", Unit.OPERATION, "Number of lookups in the cache (hits + misses)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_DOCUMENT_STORE_CACHE_INVALIDATIONS("content.proton.documentdb.notready.document_store.cache.invalidations", Unit.OPERATION, "Number of invalidations (erased elements) in the cache. "), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_ELEMENTS("content.proton.documentdb.removed.document_store.cache.elements", Unit.ITEM, "Number of elements in the cache"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_HIT_RATE("content.proton.documentdb.removed.document_store.cache.hit_rate", Unit.FRACTION, "Rate of hits in the cache compared to number of lookups"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_INVALIDATIONS("content.proton.documentdb.removed.document_store.cache.invalidations", Unit.ITEM, "Number of invalidations (erased elements) in the cache. "), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_LOOKUPS("content.proton.documentdb.removed.document_store.cache.lookups", Unit.OPERATION, "Number of lookups in the cache (hits + misses)"), + CONTENT_PROTON_DOCUMENTDB_REMOVED_DOCUMENT_STORE_CACHE_MEMORY_USAGE("content.proton.documentdb.removed.document_store.cache.memory_usage", Unit.BYTE, "Memory usage of the cache (in bytes)"), + + // attribute + CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.ready.attribute.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.ready.attribute.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.ready.attribute.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_READY_ATTRIBUTE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.ready.attribute.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.notready.attribute.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.notready.attribute.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.notready.attribute.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_NOTREADY_ATTRIBUTE_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.notready.attribute.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + + // index + CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_ALLOCATED_BYTES("content.proton.documentdb.index.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_USED_BYTES("content.proton.documentdb.index.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_DEAD_BYTES("content.proton.documentdb.index.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + CONTENT_PROTON_DOCUMENTDB_INDEX_MEMORY_USAGE_ONHOLD_BYTES("content.proton.documentdb.index.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + CONTENT_PROTON_DOCUMENTDB_INDEX_DISK_USAGE("content.proton.documentdb.index.disk_usage", Unit.BYTE, "Disk space usage in bytes"), + + // matching + CONTENT_PROTON_DOCUMENTDB_MATCHING_QUERIES("content.proton.documentdb.matching.queries", Unit.QUERY, "Number of queries executed"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_SOFT_DOOMED_QUERIES("content.proton.documentdb.matching.soft_doomed_queries", Unit.QUERY, "Number of queries hitting the soft timeout"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_QUERY_LATENCY("content.proton.documentdb.matching.query_latency", Unit.SECOND, "Total average latency (sec) when matching and ranking a query"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_QUERY_SETUP_TIME("content.proton.documentdb.matching.query_setup_time", Unit.SECOND, "Average time (sec) spent setting up and tearing down queries"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_DOCS_MATCHED("content.proton.documentdb.matching.docs_matched", Unit.DOCUMENT, "Number of documents matched"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_DOCS_RANKED("content.proton.documentdb.matching.docs_ranked", Unit.DOCUMENT, "Number of documents ranked (first phase)"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_DOCS_RERANKED("content.proton.documentdb.matching.docs_reranked", Unit.DOCUMENT, "Number of documents re-ranked (second phase)"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_QUERIES("content.proton.documentdb.matching.rank_profile.queries", Unit.QUERY, "Number of queries executed"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_SOFT_DOOMED_QUERIES("content.proton.documentdb.matching.rank_profile.soft_doomed_queries", Unit.QUERY, "Number of queries hitting the soft timeout"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_SOFT_DOOM_FACTOR("content.proton.documentdb.matching.rank_profile.soft_doom_factor", Unit.FRACTION, "Factor used to compute soft-timeout"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_QUERY_LATENCY("content.proton.documentdb.matching.rank_profile.query_latency", Unit.SECOND, "Total average latency (sec) when matching and ranking a query"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_QUERY_SETUP_TIME("content.proton.documentdb.matching.rank_profile.query_setup_time", Unit.SECOND, "Average time (sec) spent setting up and tearing down queries"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_GROUPING_TIME("content.proton.documentdb.matching.rank_profile.grouping_time", Unit.SECOND, "Average time (sec) spent on grouping"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_RERANK_TIME("content.proton.documentdb.matching.rank_profile.rerank_time", Unit.SECOND, "Average time (sec) spent on 2nd phase ranking"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCS_MATCHED("content.proton.documentdb.matching.rank_profile.docs_matched", Unit.DOCUMENT, "Number of documents matched"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCS_RANKED("content.proton.documentdb.matching.rank_profile.docs_ranked", Unit.DOCUMENT, "Number of documents ranked (first phase)"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCS_RERANKED("content.proton.documentdb.matching.rank_profile.docs_reranked", Unit.DOCUMENT, "Number of documents re-ranked (second phase)"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_LIMITED_QUERIES("content.proton.documentdb.matching.rank_profile.limited_queries", Unit.QUERY, "Number of queries limited in match phase"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_ACTIVE_TIME("content.proton.documentdb.matching.rank_profile.docid_partition.active_time", Unit.SECOND, "Time (sec) spent doing actual work"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_DOCS_MATCHED("content.proton.documentdb.matching.rank_profile.docid_partition.docs_matched", Unit.DOCUMENT, "Number of documents matched"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_DOCS_RANKED("content.proton.documentdb.matching.rank_profile.docid_partition.docs_ranked", Unit.DOCUMENT, "Number of documents ranked (first phase)"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_DOCS_RERANKED("content.proton.documentdb.matching.rank_profile.docid_partition.docs_reranked", Unit.DOCUMENT, "Number of documents re-ranked (second phase)"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_DOCID_PARTITION_WAIT_TIME("content.proton.documentdb.matching.rank_profile.docid_partition.wait_time", Unit.SECOND, "Time (sec) spent waiting for other external threads and resources"), + CONTENT_PROTON_DOCUMENTDB_MATCHING_RANK_PROFILE_MATCH_TIME("content.proton.documentdb.matching.rank_profile.match_time", Unit.SECOND, "Average time (sec) for matching a query (1st phase)"), + + // feeding + CONTENT_PROTON_DOCUMENTDB_FEEDING_COMMIT_OPERATIONS("content.proton.documentdb.feeding.commit.operations", Unit.OPERATION, "Number of operations included in a commit"), + CONTENT_PROTON_DOCUMENTDB_FEEDING_COMMIT_LATENCY("content.proton.documentdb.feeding.commit.latency", Unit.SECOND, "Latency for commit in seconds"), + + + // Metrics emitters not used in any metrics sets + CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_CACHED("content.proton.session_cache.grouping.num_cached", Unit.SESSION, "Number of currently cached sessions"), + CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_DROPPED("content.proton.session_cache.grouping.num_dropped", Unit.SESSION, "Number of dropped cached sessions"), + CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_INSERT("content.proton.session_cache.grouping.num_insert", Unit.SESSION, "Number of inserted sessions"), + CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_PICK("content.proton.session_cache.grouping.num_pick", Unit.SESSION, "Number if picked sessions"), + CONTENT_PROTON_SESSION_CACHE_GROUPING_NUM_TIMEDOUT("content.proton.session_cache.grouping.num_timedout", Unit.SESSION, "Number of timed out sessions"), + CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_CACHED("content.proton.session_cache.search.num_cached", Unit.SESSION, "Number of currently cached sessions"), + CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_DROPPED("content.proton.session_cache.search.num_dropped", Unit.SESSION, "Number of dropped cached sessions"), + CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_INSERT("content.proton.session_cache.search.num_insert", Unit.SESSION, "Number of inserted sessions"), + CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_PICK("content.proton.session_cache.search.num_pick", Unit.SESSION, "Number if picked sessions"), + CONTENT_PROTON_SESSION_CACHE_SEARCH_NUM_TIMEDOUT("content.proton.session_cache.search.num_timedout", Unit.SESSION, "Number of timed out sessions"), + + METRICMANAGER_PERIODICHOOKLATENCY("metricmanager.periodichooklatency", Unit.MILLISECOND, "Time in ms used to update a single periodic hook"), + METRICMANAGER_RESETLATENCY("metricmanager.resetlatency", Unit.MILLISECOND, "Time in ms used to reset all metrics."), + METRICMANAGER_SLEEPTIME("metricmanager.sleeptime", Unit.MILLISECOND, "Time in ms worker thread is sleeping"), + METRICMANAGER_SNAPSHOTHOOKLATENCY("metricmanager.snapshothooklatency", Unit.MILLISECOND, "Time in ms used to update a single snapshot hook"), + METRICMANAGER_SNAPSHOTLATENCY("metricmanager.snapshotlatency", Unit.MILLISECOND, "Time in ms used to take a snapshot"); + + + private final String name; + private final Unit unit; + private final String description; + + SearchNodeMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java b/metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java new file mode 100644 index 00000000000..35ecbae85d8 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/SentinelMetrics.java @@ -0,0 +1,36 @@ +package ai.vespa.metrics; + +/** + * @author yngve + */ +public enum SentinelMetrics implements VespaMetrics { + + SENTINEL_RESTARTS("sentinel.restarts", Unit.RESTART, "Number of service restarts done by the sentinel"), + SENTINEL_TOTAL_RESTARTS("sentinel.totalRestarts", Unit.RESTART, "Total number of service restarts done by the sentinel since the sentinel was started"), + SENTINEL_UPTIME("sentinel.uptime", Unit.SECOND, "Time the sentinel has been running"), + SENTINEL_RUNNING("sentinel.running", Unit.INSTANCE, "Number of services the sentinel has running currently"); + + + private final String name; + private final Unit unit; + private final String description; + + SentinelMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java b/metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java new file mode 100644 index 00000000000..1a6735af860 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/SlobrokMetrics.java @@ -0,0 +1,37 @@ +package ai.vespa.metrics; + +/** + * @author yngve + */ +public enum SlobrokMetrics implements VespaMetrics { + + SLOBROK_HEARTBEATS_FAILED("slobrok.heartbeats.failed", Unit.REQUEST, "Number of heartbeat requests failed"), + SLOBROK_REQUESTS_REGISTER("slobrok.requests.register", Unit.REQUEST, "Number of register requests received"), + SLOBROK_REQUESTS_MIRROR("slobrok.requests.mirror", Unit.REQUEST, "Number of mirroring requests received"), + SLOBROK_REQUESTS_ADMIN("slobrok.requests.admin", Unit.REQUEST, "Number of administrative requests received"), + SLOBROK_MISSING_CONSENSUS("slobrok.missing.consensus", Unit.SECOND, "Number of seconds without full consensus with all other brokers"); + + + private final String name; + private final Unit unit; + private final String description; + + SlobrokMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java b/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java new file mode 100644 index 00000000000..7071fe0ae77 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java @@ -0,0 +1,253 @@ +package ai.vespa.metrics; + +/** + * @author yngveaasheim + */ +public enum StorageMetrics implements VespaMetrics { + + VDS_DATASTORED_ALLDISKS_BUCKETS("vds.datastored.alldisks.buckets", Unit.BUCKET, "Number of buckets managed"), + VDS_DATASTORED_ALLDISKS_DOCS("vds.datastored.alldisks.docs", Unit.DOCUMENT, "Number of documents stored"), + VDS_DATASTORED_ALLDISKS_BYTES("vds.datastored.alldisks.bytes", Unit.BYTE, "Number of bytes stored"), + VDS_DATASTORED_ALLDISKS_ACTIVEBUCKETS("vds.datastored.alldisks.activebuckets", Unit.BUCKET, "Number of active buckets on the node"), + VDS_DATASTORED_ALLDISKS_READYBUCKETS("vds.datastored.alldisks.readybuckets", Unit.BUCKET, "Number of ready buckets on the node"), + + VDS_VISITOR_ALLTHREADS_AVERAGEVISITORLIFETIME("vds.visitor.allthreads.averagevisitorlifetime", Unit.MILLISECOND, "Average lifetime of a visitor"), + VDS_VISITOR_ALLTHREADS_AVERAGEQUEUEWAIT("vds.visitor.allthreads.averagequeuewait", Unit.MILLISECOND, "Average time an operation spends in input queue."), + VDS_VISITOR_ALLTHREADS_QUEUESIZE("vds.visitor.allthreads.queuesize", Unit.OPERATION, "Size of input message queue."), + VDS_VISITOR_ALLTHREADS_COMPLETED("vds.visitor.allthreads.completed", Unit.OPERATION, "Number of visitors completed"), + VDS_VISITOR_ALLTHREADS_CREATED("vds.visitor.allthreads.created", Unit.OPERATION, "Number of visitors created."), + VDS_VISITOR_ALLTHREADS_FAILED("vds.visitor.allthreads.failed", Unit.OPERATION, "Number of visitors failed"), + VDS_VISITOR_ALLTHREADS_AVERAGEMESSAGESENDTIME("vds.visitor.allthreads.averagemessagesendtime", Unit.MILLISECOND, "Average time it takes for messages to be sent to their target (and be replied to)"), + VDS_VISITOR_ALLTHREADS_AVERAGEPROCESSINGTIME("vds.visitor.allthreads.averageprocessingtime", Unit.MILLISECOND, "Average time used to process visitor requests"), + VDS_VISITOR_ALLTHREADS_ABORTED("vds.visitor.allthreads.aborted", Unit.INSTANCE, "Number of visitors aborted."), + VDS_VISITOR_ALLTHREADS_AVERAGEVISITORCREATIONTIME("vds.visitor.allthreads.averagevisitorcreationtime", Unit.MILLISECOND, "Average time spent creating a visitor instance"), + VDS_VISITOR_ALLTHREADS_DESTINATION_FAILURE_REPLIES("vds.visitor.allthreads.destination_failure_replies", Unit.INSTANCE, "Number of failure replies received from the visitor destination"), + + VDS_FILESTOR_QUEUESIZE("vds.filestor.queuesize", Unit.OPERATION, "Size of input message queue."), + VDS_FILESTOR_AVERAGEQUEUEWAIT("vds.filestor.averagequeuewait", Unit.MILLISECOND, "Average time an operation spends in input queue."), + VDS_FILESTOR_ACTIVE_OPERATIONS_SIZE("vds.filestor.active_operations.size", Unit.OPERATION, "Number of concurrent active operations"), + VDS_FILESTOR_ACTIVE_OPERATIONS_LATENCY("vds.filestor.active_operations.latency", Unit.MILLISECOND, "Latency (in ms) for completed operations"), // TODO Vespa 9: Remove 'active' from the metric name + VDS_FILESTOR_THROTTLE_WINDOW_SIZE("vds.filestor.throttle_window_size", Unit.OPERATION, "Current size of async operation throttler window size"), + VDS_FILESTOR_THROTTLE_WAITING_THREADS("vds.filestor.throttle_waiting_threads", Unit.THREAD, "Number of threads waiting to acquire a throttle token"), + VDS_FILESTOR_THROTTLE_ACTIVE_TOKENS("vds.filestor.throttle_active_tokens", Unit.INSTANCE, "Current number of active throttle tokens"), + VDS_FILESTOR_ALLTHREADS_MERGEMETADATAREADLATENCY("vds.filestor.allthreads.mergemetadatareadlatency", Unit.MILLISECOND, "Time spent in a merge step to check metadata of current node to see what data it has."), + VDS_FILESTOR_ALLTHREADS_MERGEDATAREADLATENCY("vds.filestor.allthreads.mergedatareadlatency", Unit.MILLISECOND, "Time spent in a merge step to read data other nodes need."), + VDS_FILESTOR_ALLTHREADS_MERGEDATAWRITELATENCY("vds.filestor.allthreads.mergedatawritelatency", Unit.MILLISECOND, "Time spent in a merge step to write data needed to current node."), + VDS_FILESTOR_ALLTHREADS_MERGEAVGDATARECEIVEDNEEDED("vds.filestor.allthreads.mergeavgdatareceivedneeded", Unit.BYTE, "Amount of data transferred from previous node in chain that we needed to apply locally."), + VDS_FILESTOR_ALLTHREADS_MERGEBUCKETS_COUNT("vds.filestor.allthreads.mergebuckets.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_MERGEBUCKETS_FAILED("vds.filestor.allthreads.mergebuckets.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_MERGEBUCKETS_LATENCY("vds.filestor.allthreads.mergebuckets.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_MERGELATENCYTOTAL("vds.filestor.allthreads.mergelatencytotal", Unit.MILLISECOND, "Latency of total merge operation, from master node receives it, until merge is complete and master node replies."), + VDS_FILESTOR_ALLTHREADS_MERGE_PUT_LATENCY("vds.filestor.allthreads.put_latency", Unit.MILLISECOND, "Latency of individual puts that are part of merge operations"), // TODO Vespa 9: Update metric name to include 'merge' + VDS_FILESTOR_ALLTHREADS_MERGE_REMOVE_LATENCY("vds.filestor.allthreads.remove_latency", Unit.MILLISECOND, "Latency of individual removes that are part of merge operations"), // TODO Vespa 9: Update metric name to include 'merge' + VDS_FILESTOR_ALLSTRIPES_THROTTLED_RPC_DIRECT_DISPATCHES("vds.filestor.allstripes.throttled_rpc_direct_dispatches", Unit.INSTANCE, "Number of times an RPC thread could not directly dispatch an async operation directly to Proton because it was disallowed by the throttle policy"), + VDS_FILESTOR_ALLSTRIPES_THROTTLED_PERSISTENCE_THREAD_POLLS("vds.filestor.allstripes.throttled_persistence_thread_polls", Unit.INSTANCE, "Number of times a persistence thread could not immediately dispatch a queued async operation because it was disallowed by the throttle policy"), + VDS_FILESTOR_ALLSTRIPES_TIMEOUTS_WAITING_FOR_THROTTLE_TOKEN("vds.filestor.allstripes.timeouts_waiting_for_throttle_token", Unit.INSTANCE, "Number of times a persistence thread timed out waiting for an available throttle policy token"), + VDS_FILESTOR_ALLSTRIPES_AVERAGEQUEUEWAIT("vds.filestor.allstripes.averagequeuewait", Unit.MILLISECOND, "Average time an operation spends in input queue."), + + VDS_FILESTOR_ALLTHREADS_PUT_COUNT("vds.filestor.allthreads.put.count", Unit.OPERATION, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_PUT_FAILED("vds.filestor.allthreads.put.failed", Unit.OPERATION, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_PUT_TEST_AND_SET_FAILED("vds.filestor.allthreads.put.test_and_set_failed", Unit.OPERATION, "Number of operations that were skipped due to a test-and-set condition not met"), + VDS_FILESTOR_ALLTHREADS_PUT_LATENCY("vds.filestor.allthreads.put.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_PUT_REQUEST_SIZE("vds.filestor.allthreads.put.request_size", Unit.BYTE, "Size of requests, in bytes"), + VDS_FILESTOR_ALLTHREADS_REMOVE_COUNT("vds.filestor.allthreads.remove.count", Unit.OPERATION, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_REMOVE_FAILED("vds.filestor.allthreads.remove.failed", Unit.OPERATION, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_REMOVE_TEST_AND_SET_FAILED("vds.filestor.allthreads.remove.test_and_set_failed", Unit.OPERATION, "Number of operations that were skipped due to a test-and-set condition not met"), + VDS_FILESTOR_ALLTHREADS_REMOVE_LATENCY("vds.filestor.allthreads.remove.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_REMOVE_REQUEST_SIZE("vds.filestor.allthreads.remove.request_size", Unit.BYTE, "Size of requests, in bytes"), + VDS_FILESTOR_ALLTHREADS_REMOVE_NOT_FOUND("vds.filestor.allthreads.remove.not_found", Unit.REQUEST, "Number of requests that could not be completed due to source document not found."), + + VDS_FILESTOR_ALLTHREADS_GET_COUNT("vds.filestor.allthreads.get.count", Unit.OPERATION, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_GET_FAILED("vds.filestor.allthreads.get.failed", Unit.OPERATION, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_GET_LATENCY("vds.filestor.allthreads.get.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_GET_REQUEST_SIZE("vds.filestor.allthreads.get.request_size", Unit.BYTE, "Size of requests, in bytes"), + VDS_FILESTOR_ALLTHREADS_GET_NOT_FOUND("vds.filestor.allthreads.get.not_found", Unit.REQUEST, "Number of requests that could not be completed due to source document not found."), + VDS_FILESTOR_ALLTHREADS_UPDATE_COUNT("vds.filestor.allthreads.update.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_UPDATE_FAILED("vds.filestor.allthreads.update.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_UPDATE_TEST_AND_SET_FAILED("vds.filestor.allthreads.update.test_and_set_failed", Unit.REQUEST, "Number of requests that were skipped due to a test-and-set condition not met"), + VDS_FILESTOR_ALLTHREADS_UPDATE_LATENCY("vds.filestor.allthreads.update.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_UPDATE_REQUEST_SIZE("vds.filestor.allthreads.update.request_size", Unit.BYTE, "Size of requests, in bytes"), + VDS_FILESTOR_ALLTHREADS_UPDATE_LATENCY_READ("vds.filestor.allthreads.update.latency_read", Unit.MILLISECOND, "Latency of the source read in the request."), + VDS_FILESTOR_ALLTHREADS_UPDATE_NOT_FOUND("vds.filestor.allthreads.update.not_found", Unit.REQUEST, "Number of requests that could not be completed due to source document not found."), + VDS_FILESTOR_ALLTHREADS_CREATEITERATOR_COUNT("vds.filestor.allthreads.createiterator.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_CREATEITERATOR_LATENCY("vds.filestor.allthreads.createiterator.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_CREATEITERATOR_FAILED("vds.filestor.allthreads.createiterator.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_VISIT_COUNT("vds.filestor.allthreads.visit.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_VISIT_LATENCY("vds.filestor.allthreads.visit.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_VISIT_DOCS("vds.filestor.allthreads.visit.docs", Unit.DOCUMENT, "Number of entries read per iterate call"), + VDS_FILESTOR_ALLTHREADS_VISIT_FAILED("vds.filestor.allthreads.visit.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_REMOVE_LOCATION_COUNT("vds.filestor.allthreads.remove_location.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_REMOVE_LOCATION_LATENCY("vds.filestor.allthreads.remove_location.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_REMOVE_LOCATION_FAILED("vds.filestor.allthreads.remove_location.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_SPLITBUCKETS_COUNT("vds.filestor.allthreads.splitbuckets.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_SPLITBUCKETS_FAILED("vds.filestor.allthreads.splitbuckets.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_SPLITBUCKETS_LATENCY("vds.filestor.allthreads.splitbuckets.latency", Unit.REQUEST, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_JOINBUCKETS_COUNT("vds.filestor.allthreads.joinbuckets.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_JOINBUCKETS_FAILED("vds.filestor.allthreads.joinbuckets.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_JOINBUCKETS_LATENCY("vds.filestor.allthreads.joinbuckets.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_DELETEBUCKETS_COUNT("vds.filestor.allthreads.deletebuckets.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_DELETEBUCKETS_FAILED("vds.filestor.allthreads.deletebuckets.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_DELETEBUCKETS_LATENCY("vds.filestor.allthreads.deletebuckets.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_SETBUCKETSTATES_COUNT("vds.filestor.allthreads.setbucketstates.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_SETBUCKETSTATES_FAILED("vds.filestor.allthreads.setbucketstates.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_SETBUCKETSTATES_LATENCY("vds.filestor.allthreads.setbucketstates.latency", Unit.MILLISECOND, "Latency of successful requests."), + + VDS_MERGETHROTTLER_AVERAGEQUEUEWAITINGTIME("vds.mergethrottler.averagequeuewaitingtime", Unit.MILLISECOND, "Time merges spent in the throttler queue"), + VDS_MERGETHROTTLER_QUEUESIZE("vds.mergethrottler.queuesize", Unit.INSTANCE, "Length of merge queue"), + VDS_MERGETHROTTLER_ACTIVE_WINDOW_SIZE("vds.mergethrottler.active_window_size", Unit.INSTANCE, "Number of merges active within the pending window size"), + VDS_MERGETHROTTLER_BOUNCED_DUE_TO_BACK_PRESSURE("vds.mergethrottler.bounced_due_to_back_pressure", Unit.INSTANCE, "Number of merges bounced due to resource exhaustion back-pressure"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_OK("vds.mergethrottler.locallyexecutedmerges.ok", Unit.INSTANCE, "The number of successful merges for 'locallyexecutedmerges'"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_ABORTED("vds.mergethrottler.locallyexecutedmerges.failures.aborted", Unit.OPERATION, "The number of merges that failed because the storage node was (most likely) shutting down"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_BUCKETNOTFOUND("vds.mergethrottler.locallyexecutedmerges.failures.bucketnotfound", Unit.OPERATION, "The number of operations that failed because the bucket did not exist"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_BUSY("vds.mergethrottler.locallyexecutedmerges.failures.busy", Unit.OPERATION, "The number of merges that failed because the storage node was busy"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_EXISTS("vds.mergethrottler.locallyexecutedmerges.failures.exists", Unit.OPERATION, "The number of merges that were rejected due to a merge operation for their bucket already being processed"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_NOTREADY("vds.mergethrottler.locallyexecutedmerges.failures.notready", Unit.OPERATION, "The number of merges discarded because distributor was not ready"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_OTHER("vds.mergethrottler.locallyexecutedmerges.failures.other", Unit.OPERATION, "The number of other failures"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_REJECTED("vds.mergethrottler.locallyexecutedmerges.failures.rejected", Unit.OPERATION, "The number of merges that were rejected"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_TIMEOUT("vds.mergethrottler.locallyexecutedmerges.failures.timeout", Unit.OPERATION, "The number of merges that failed because they timed out towards storage"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_TOTAL("vds.mergethrottler.locallyexecutedmerges.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_FAILURES_WRONGDISTRIBUTION("vds.mergethrottler.locallyexecutedmerges.failures.wrongdistribution", Unit.OPERATION, "The number of merges that were discarded (flushed) because they were initiated at an older cluster state than the current"), + VDS_MERGETHROTTLER_MERGECHAINS_OK("vds.mergethrottler.mergechains.ok", Unit.OPERATION, "The number of successful merges for 'mergechains'"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_BUSY("vds.mergethrottler.mergechains.failures.busy", Unit.OPERATION, "The number of merges that failed because the storage node was busy"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_TOTAL("vds.mergethrottler.mergechains.failures.total", Unit.OPERATION, "Sum of all failures"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_EXISTS("vds.mergethrottler.mergechains.failures.exists", Unit.OPERATION, "The number of merges that were rejected due to a merge operation for their bucket already being processed"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_NOTREADY("vds.mergethrottler.mergechains.failures.notready", Unit.OPERATION, "The number of merges discarded because distributor was not ready"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_OTHER("vds.mergethrottler.mergechains.failures.other", Unit.OPERATION, "The number of other failures"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_REJECTED("vds.mergethrottler.mergechains.failures.rejected", Unit.OPERATION, "The number of merges that were rejected"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_TIMEOUT("vds.mergethrottler.mergechains.failures.timeout", Unit.OPERATION, "The number of merges that failed because they timed out towards storage"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_WRONGDISTRIBUTION("vds.mergethrottler.mergechains.failures.wrongdistribution", Unit.OPERATION, "The number of merges that were discarded (flushed) because they were initiated at an older cluster state than the current"), + + + // C++ TLS metrics - these come from both the distributor and storage + VDS_SERVER_NETWORK_TLS_HANDSHAKES_FAILED("vds.server.network.tls-handshakes-failed", Unit.OPERATION, "Number of client or server connection attempts that failed during TLS handshaking"), + VDS_SERVER_NETWORK_PEER_AUTHORIZATION_FAILURES("vds.server.network.peer-authorization-failures", Unit.FAILURE, "Number of TLS connection attempts failed due to bad or missing peer certificate credentials"), + VDS_SERVER_NETWORK_CLIENT_TLS_CONNECTIONS_ESTABLISHED("vds.server.network.client.tls-connections-established", Unit.CONNECTION, "Number of secure mTLS connections established"), + VDS_SERVER_NETWORK_SERVER_TLS_CONNECTIONS_ESTABLISHED("vds.server.network.server.tls-connections-established", Unit.CONNECTION, "Number of secure mTLS connections established"), + VDS_SERVER_NETWORK_CLIENT_INSECURE_CONNECTIONS_ESTABLISHED("vds.server.network.client.insecure-connections-established", Unit.CONNECTION, "Number of insecure (plaintext) connections established"), + VDS_SERVER_NETWORK_SERVER_INSECURE_CONNECTIONS_ESTABLISHED("vds.server.network.server.insecure-connections-established", Unit.CONNECTION, "Number of insecure (plaintext) connections established"), + VDS_SERVER_NETWORK_TLS_CONNECTIONS_BROKEN("vds.server.network.tls-connections-broken", Unit.CONNECTION, "Number of TLS connections broken due to failures during frame encoding or decoding"), + VDS_SERVER_NETWORK_FAILED_TLS_CONFIG_RELOADS("vds.server.network.failed-tls-config-reloads", Unit.FAILURE, "Number of times background reloading of TLS config has failed"), + + VDS_BOUNCER_UNAVAILABLE_NODE_ABORTS("vds.bouncer.unavailable_node_aborts", Unit.OPERATION, "Number of operations that were aborted due to the node (or target bucket space) being unavailable"), + VDS_CHANGEDBUCKETOWNERSHIPHANDLER_AVG_ABORT_PROCESSING_TIME("vds.changedbucketownershiphandler.avg_abort_processing_time", Unit.MILLISECOND, "Average time spent aborting operations for changed buckets"), + VDS_CHANGEDBUCKETOWNERSHIPHANDLER_EXTERNAL_LOAD_OPS_ABORTED("vds.changedbucketownershiphandler.external_load_ops_aborted", Unit.OPERATION, "Number of outdated external load operations aborted"), + VDS_CHANGEDBUCKETOWNERSHIPHANDLER_IDEAL_STATE_OPS_ABORTED("vds.changedbucketownershiphandler.ideal_state_ops_aborted", Unit.OPERATION, "Number of outdated ideal state operations aborted"), + VDS_COMMUNICATION_BUCKET_SPACE_MAPPING_FAILURES("vds.communication.bucket_space_mapping_failures", Unit.OPERATION, "Number of messages that could not be resolved to a known bucket space"), + VDS_COMMUNICATION_CONVERTFAILURES("vds.communication.convertfailures", Unit.OPERATION, "Number of messages that failed to get converted to storage API messages"), + VDS_COMMUNICATION_EXCEPTIONMESSAGEPROCESSTIME("vds.communication.exceptionmessageprocesstime", Unit.MILLISECOND, "Time transport thread uses to process a single message that fails with an exception thrown into communication manager"), + VDS_COMMUNICATION_MESSAGEPROCESSTIME("vds.communication.messageprocesstime", Unit.MILLISECOND, "Time transport thread uses to process a single message"), + VDS_COMMUNICATION_MESSAGEQUEUE("vds.communication.messagequeue", Unit.ITEM, "Size of input message queue."), + VDS_COMMUNICATION_SENDCOMMANDLATENCY("vds.communication.sendcommandlatency", Unit.MILLISECOND, "Average ms used to send commands to MBUS"), + VDS_COMMUNICATION_SENDREPLYLATENCY("vds.communication.sendreplylatency", Unit.MILLISECOND, "Average ms used to send replies to MBUS"), + VDS_COMMUNICATION_TOOLITTLEMEMORY("vds.communication.toolittlememory", Unit.OPERATION, "Number of messages failed due to too little memory available"), + + VDS_DATASTORED_BUCKET_SPACE_ACTIVE_BUCKETS("vds.datastored.bucket_space.active_buckets", Unit.BUCKET, "Number of active buckets in the bucket space"), + VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_ALLOCATED_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.allocated_bytes", Unit.BYTE, "The number of allocated bytes"), + VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_DEAD_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.dead_bytes", Unit.BYTE, "The number of dead bytes (<= used_bytes)"), + VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_ONHOLD_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"), + VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_USED_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"), + VDS_DATASTORED_BUCKET_SPACE_BUCKETS_TOTAL("vds.datastored.bucket_space.buckets_total", Unit.BUCKET, "Total number buckets present in the bucket space (ready + not ready)"), + VDS_DATASTORED_BUCKET_SPACE_BYTES("vds.datastored.bucket_space.bytes", Unit.BYTE, "Bytes stored across all documents in the bucket space"), + VDS_DATASTORED_BUCKET_SPACE_DOCS("vds.datastored.bucket_space.docs", Unit.DOCUMENT, "Documents stored in the bucket space"), + VDS_DATASTORED_BUCKET_SPACE_READY_BUCKETS("vds.datastored.bucket_space.ready_buckets", Unit.BUCKET, "Number of ready buckets in the bucket space"), + VDS_DATASTORED_FULLBUCKETINFOLATENCY("vds.datastored.fullbucketinfolatency", Unit.MILLISECOND, "Amount of time spent to process a full bucket info request"), + VDS_DATASTORED_FULLBUCKETINFOREQSIZE("vds.datastored.fullbucketinforeqsize", Unit.NODE, "Amount of distributors answered at once in full bucket info requests."), + VDS_DATASTORED_SIMPLEBUCKETINFOREQSIZE("vds.datastored.simplebucketinforeqsize", Unit.BUCKET, "Amount of buckets returned in simple bucket info requests"), + + VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFF_COUNT("vds.filestor.allthreads.applybucketdiff.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFF_FAILED("vds.filestor.allthreads.applybucketdiff.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFF_LATENCY("vds.filestor.allthreads.applybucketdiff.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_APPLYBUCKETDIFFREPLY("vds.filestor.allthreads.applybucketdiffreply", Unit.REQUEST, "Number of applybucketdiff replies that have been processed."), + VDS_FILESTOR_ALLTHREADS_BUCKETFIXED("vds.filestor.allthreads.bucketfixed", Unit.BUCKET, "Number of times bucket has been fixed because of corruption"), + VDS_FILESTOR_ALLTHREADS_BUCKETVERIFIED_COUNT("vds.filestor.allthreads.bucketverified.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_BUCKETVERIFIED_FAILED("vds.filestor.allthreads.bucketverified.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_BUCKETVERIFIED_LATENCY("vds.filestor.allthreads.bucketverified.latency", Unit.REQUEST, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_BYTESMERGED("vds.filestor.allthreads.bytesmerged", Unit.BYTE, "Total number of bytes merged into this node."), + VDS_FILESTOR_ALLTHREADS_CREATEBUCKETS_COUNT("vds.filestor.allthreads.createbuckets.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_CREATEBUCKETS_FAILED("vds.filestor.allthreads.createbuckets.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_CREATEBUCKETS_LATENCY("vds.filestor.allthreads.createbuckets.latency", Unit.REQUEST, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_FAILEDOPERATIONS("vds.filestor.allthreads.failedoperations", Unit.OPERATION, "Number of operations throwing exceptions."), + VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFF_COUNT("vds.filestor.allthreads.getbucketdiff.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFF_FAILED("vds.filestor.allthreads.getbucketdiff.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFF_LATENCY("vds.filestor.allthreads.getbucketdiff.latency", Unit.REQUEST, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_GETBUCKETDIFFREPLY("vds.filestor.allthreads.getbucketdiffreply", Unit.REQUEST, "Number of getbucketdiff replies that have been processed."), + VDS_FILESTOR_ALLTHREADS_INTERNALJOIN_COUNT("vds.filestor.allthreads.internaljoin.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_INTERNALJOIN_FAILED("vds.filestor.allthreads.internaljoin.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_INTERNALJOIN_LATENCY("vds.filestor.allthreads.internaljoin.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_MOVEDBUCKETS_COUNT("vds.filestor.allthreads.movedbuckets.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_MOVEDBUCKETS_FAILED("vds.filestor.allthreads.movedbuckets.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_MOVEDBUCKETS_LATENCY("vds.filestor.allthreads.movedbuckets.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_OPERATIONS("vds.filestor.allthreads.operations", Unit.OPERATION, "Number of operations processed."), + + VDS_FILESTOR_ALLTHREADS_READBUCKETINFO_COUNT("vds.filestor.allthreads.readbucketinfo.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_READBUCKETINFO_FAILED("vds.filestor.allthreads.readbucketinfo.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_READBUCKETINFO_LATENCY("vds.filestor.allthreads.readbucketinfo.latency", Unit.REQUEST, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_READBUCKETLIST_COUNT("vds.filestor.allthreads.readbucketlist.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_READBUCKETLIST_FAILED("vds.filestor.allthreads.readbucketlist.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_READBUCKETLIST_LATENCY("vds.filestor.allthreads.readbucketlist.latency", Unit.MILLISECOND, "Latency of successful requests."), + + VDS_FILESTOR_ALLTHREADS_RECHECKBUCKETINFO_COUNT("vds.filestor.allthreads.recheckbucketinfo.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_RECHECKBUCKETINFO_FAILED("vds.filestor.allthreads.recheckbucketinfo.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_RECHECKBUCKETINFO_LATENCY("vds.filestor.allthreads.recheckbucketinfo.latency", Unit.MILLISECOND, "Latency of successful requests."), + + VDS_FILESTOR_ALLTHREADS_REVERT_COUNT("vds.filestor.allthreads.revert.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_REVERT_FAILED("vds.filestor.allthreads.revert.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_REVERT_LATENCY("vds.filestor.allthreads.revert.latency", Unit.MILLISECOND, "Latency of successful requests."), + VDS_FILESTOR_ALLTHREADS_REVERT_NOT_FOUND("vds.filestor.allthreads.revert.not_found", Unit.REQUEST, "Number of requests that could not be completed due to source document not found."), + VDS_FILESTOR_ALLTHREADS_STAT_BUCKET_COUNT("vds.filestor.allthreads.stat_bucket.count", Unit.REQUEST, "Number of requests processed."), + VDS_FILESTOR_ALLTHREADS_STAT_BUCKET_FAILED("vds.filestor.allthreads.stat_bucket.failed", Unit.REQUEST, "Number of failed requests."), + VDS_FILESTOR_ALLTHREADS_STAT_BUCKET_LATENCY("vds.filestor.allthreads.stat_bucket.latency", Unit.REQUEST, "Latency of successful requests."), + VDS_FILESTOR_BUCKET_DB_INIT_LATENCY("vds.filestor.bucket_db_init_latency", Unit.MILLISECOND, "Time taken (in ms) to initialize bucket databases with information from the persistence provider"), + VDS_FILESTOR_DIRECTORYEVENTS("vds.filestor.directoryevents", Unit.OPERATION, "Number of directory events received."), + VDS_FILESTOR_DISKEVENTS("vds.filestor.diskevents", Unit.OPERATION, "Number of disk events received."), + VDS_FILESTOR_PARTITIONEVENTS("vds.filestor.partitionevents", Unit.OPERATION, "Number of partition events received."), + VDS_FILESTOR_PENDINGMERGE("vds.filestor.pendingmerge", Unit.BUCKET, "Number of buckets currently being merged."), + VDS_FILESTOR_WAITINGFORLOCKRATE("vds.filestor.waitingforlockrate", Unit.OPERATION, "Amount of times a filestor thread has needed to wait for lock to take next message in queue."), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_ABORTED("vds.mergethrottler.mergechains.failures.aborted", Unit.OPERATION, "The number of merges that failed because the storage node was (most likely) shutting down"), + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_BUCKETNOTFOUND("vds.mergethrottler.mergechains.failures.bucketnotfound", Unit.OPERATION, "The number of operations that failed because the bucket did not exist"), + VDS_SERVER_MEMORYUSAGE("vds.server.memoryusage", Unit.BYTE, "Amount of memory used by the storage subsystem"), + VDS_SERVER_MEMORYUSAGE_VISITING("vds.server.memoryusage_visiting", Unit.BYTE, "Message use from visiting"), + VDS_SERVER_MESSAGE_MEMORY_USE_HIGHPRI("vds.server.message_memory_use.highpri", Unit.BYTE, "Message use from high priority storage messages"), + VDS_SERVER_MESSAGE_MEMORY_USE_LOWPRI("vds.server.message_memory_use.lowpri", Unit.BYTE, "Message use from low priority storage messages"), + VDS_SERVER_MESSAGE_MEMORY_USE_NORMALPRI("vds.server.message_memory_use.normalpri", Unit.BYTE, "Message use from normal priority storage messages"), + VDS_SERVER_MESSAGE_MEMORY_USE_TOTAL("vds.server.message_memory_use.total", Unit.BYTE, "Message use from storage messages"), + VDS_SERVER_MESSAGE_MEMORY_USE_VERYHIGHPRI("vds.server.message_memory_use.veryhighpri", Unit.BYTE, "Message use from very high priority storage messages"), + VDS_STATE_MANAGER_INVOKE_STATE_LISTENERS_LATENCY("vds.state_manager.invoke_state_listeners_latency", Unit.MILLISECOND, "Time spent (in ms) propagating state changes to internal state listeners"), + VDS_VISITOR_CV_QUEUEEVICTEDWAITTIME("vds.visitor.cv_queueevictedwaittime", Unit.MILLISECOND, "Milliseconds waiting in create visitor queue, for visitors that was evicted from queue due to higher priority visitors coming"), + VDS_VISITOR_CV_QUEUEFULL("vds.visitor.cv_queuefull", Unit.OPERATION, "Number of create visitor messages failed as queue is full"), + VDS_VISITOR_CV_QUEUESIZE("vds.visitor.cv_queuesize", Unit.ITEM, "Size of create visitor queue"), + VDS_VISITOR_CV_QUEUETIMEOUTWAITTIME("vds.visitor.cv_queuetimeoutwaittime", Unit.MILLISECOND, "Milliseconds waiting in create visitor queue, for visitors that timed out while in the visitor quueue"), + VDS_VISITOR_CV_QUEUEWAITTIME("vds.visitor.cv_queuewaittime", Unit.MILLISECOND, "Milliseconds waiting in create visitor queue, for visitors that was added to visitor queue but scheduled later"), + VDS_VISITOR_CV_SKIPQUEUE("vds.visitor.cv_skipqueue", Unit.OPERATION, "Number of times we could skip queue as we had free visitor spots"), + + // C++ capability metrics + VDS_SERVER_NETWORK_RPC_CAPABILITY_CHECKS_FAILED("vds.server.network.rpc-capability-checks-failed", Unit.FAILURE, "Number of RPC operations that failed to due one or more missing capabilities"), + VDS_SERVER_NETWORK_STATUS_CAPABILITY_CHECKS_FAILED("vds.server.network.status-capability-checks-failed", Unit.FAILURE, "Number of status page operations that failed to due one or more missing capabilities"), + + // C++ Fnet metrics + VDS_SERVER_FNET_NUM_CONNECTIONS("vds.server.fnet.num-connections", Unit.CONNECTION, "Total number of connection objects"); + + + private final String name; + private final Unit unit; + private final String description; + + StorageMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/Suffix.java b/metrics/src/main/java/ai/vespa/metrics/Suffix.java new file mode 100644 index 00000000000..ce5e0aaa602 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/Suffix.java @@ -0,0 +1,24 @@ +package ai.vespa.metrics; + +public enum Suffix { + ninety_five_percentile("95percentile"), + ninety_nine_percentile("99percentile"), + average("average"), + count("count"), + last("last"), + max("max"), + min("min"), + rate("rate"), + sum("sum"); + + private final String suffix; + + Suffix(String suffix) { + this.suffix = suffix; + } + + public String suffix() { + return suffix; + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java new file mode 100644 index 00000000000..a2123d72246 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java @@ -0,0 +1,129 @@ +package ai.vespa.metrics; + +/** + * @author gjoranv + */ +public enum Unit { + + BINARY(BaseUnit.BINARY, "Zero or one. Zero typically indicate \"false\" while one indicate \"true\""), + BUCKET(BaseUnit.BUCKET, "A chunk of documents managed by a distributor service"), + BYTE(BaseUnit.BYTE, "A collection of 8 bits"), + BYTE_PER_SECOND(BaseUnit.BYTE, BaseUnit.SECOND, "A unit of storage capable of holding 8 bits"), + CONNECTION(BaseUnit.CONNECTION, "A link used for communication between a client and a server"), + DOCUMENT(BaseUnit.DOCUMENT, "Vespa document, a collection of fields defined in a schema file"), + DOCUMENTID(BaseUnit.DOCUMENTID, "A unique document identifier"), + FAILURE(BaseUnit.FAILURE, "Failures, typically for requests, operations or nodes"), + FILE(BaseUnit.FILE, "Data file stored on the disk on a node"), + FRACTION(BaseUnit.FRACTION, "A value in the range [0..1]. Higher values can occur for some metrics, but would indicate the value is outside of the allowed range."), + HIT(BaseUnit.HIT, "Document that meets the filtering/restriction criteria specified by a given query"), + HIT_PER_QUERY(BaseUnit.HIT, BaseUnit.QUERY, "Number of hits per query over a period of time"), + INSTANCE(BaseUnit.INSTANCE, "Typically tenant or application"), + ITEM(BaseUnit.ITEM, "Object or unit maintained in e.g. a queue"), + MILLISECOND(BaseUnit.MILLISECOND, "Millisecond, 1/1000 of a second"), + NANOSECOND(BaseUnit.NANOSECOND, "Nanosecond, 1/1000.000.000 of a second"), + NODE(BaseUnit.NODE, "(Virtual) computer that is part of a Vespa cluster"), + PACKET(BaseUnit.PACKET, "Collection of data transmitted over the network as a single unit"), + OPERATION(BaseUnit.OPERATION, "A clearly defined task"), + OPERATION_PER_SECOND(BaseUnit.OPERATION, BaseUnit.SECOND, "Number of operations per second"), + PERCENTAGE(BaseUnit.PERCENTAGE, "A number expressed as a fraction of 100. Typically in the range [0..100]."), + QUERY(BaseUnit.QUERY, "A request for matching, grouping and/or scoring documents stored in Vespa"), + QUERY_PER_SECOND(BaseUnit.QUERY, BaseUnit.SECOND, "Number of queries per second."), + RECORD(BaseUnit.RECORD, "A collection of information, typically a set of key/value, e.g. stored in a transaction log"), + REQUEST(BaseUnit.REQUEST, "A request sent from a client to a server"), + RESPONSE(BaseUnit.RESPONSE, "A response from a server to a client, typically as a response to a request"), + RESTART(BaseUnit.RESTART, "A service or node restarts"), + SCORE(BaseUnit.SCORE, "Relevance score for a document"), + SECOND(BaseUnit.SECOND, "Time span of 1 second"), + SESSION(BaseUnit.SESSION, "A set of operations taking place during one connection or as part of a higher level operation"), + TASK(BaseUnit.TASK, "Piece of work executed by a server, e.g. to perform back-ground data maintenance"), + THREAD(BaseUnit.THREAD, "Computer thread for executing e.g. tasks, operations or queries"), + VERSION(BaseUnit.VERSION, "Software or config version"), + WAKEUP(BaseUnit.WAKEUP, "Computer thread wake-ups for doing some work"); + + + private final BaseUnit unit; + private final BaseUnit perUnit; + private final String description; + + Unit(BaseUnit unit, String description) { + this(unit, null, description); + } + + Unit(BaseUnit unit, BaseUnit perUnit, String description) { + this.unit = unit; + this.perUnit = perUnit; + this.description = description; + } + + public String fullName() { + return perUnit == null ? + unit.fullName() : + unit.fullName() + "/" + perUnit.fullName(); + } + + public String shortName() { + return perUnit == null ? + unit.shortName : + unit.shortName + "/" + perUnit.shortName; + } + + public String getDescription() { + return description; + } + + private enum BaseUnit { + + AREA("area"), + BINARY("binary"), + BUCKET("bucket"), + BYTE("byte"), + CONNECTION("connection"), + DOCUMENT("document"), + DOCUMENTID("documentid"), + FAILURE("failure"), + FILE("file"), + FRACTION("fraction"), + HIT("hit"), + INSTANCE("instance"), + ITEM("item"), + MILLISECOND("millisecond", "ms"), + NANOSECOND("nanosecond", "ns"), + NODE("node"), + OPERATION("operation"), + PACKET("packet"), + PERCENTAGE("percentage"), + QUERY("query"), + RECORD("record"), + REQUEST("request"), + RESPONSE("response"), + RESTART("restart"), + SCORE("score"), + SECOND("second", "s"), + SESSION("session"), + TASK("task"), + THREAD("thread"), + VERSION("version"), + WAKEUP("wakeup"); + + private final String fullName; + private final String shortName; + + BaseUnit(String fullName) { + this(fullName, fullName); + } + + BaseUnit(String fullName, String shortName) { + this.fullName = fullName; + this.shortName = shortName; + } + + public String fullName() { + return fullName; + } + + public String shortName() { + return shortName; + } + + } +} diff --git a/metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java b/metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java new file mode 100644 index 00000000000..3a17d8a3155 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/VespaMetrics.java @@ -0,0 +1,56 @@ +package ai.vespa.metrics; + +/** + * @author gjoranv + */ +public interface VespaMetrics { + + String baseName(); + Unit unit(); + String description(); + + default String descriptionWitUnit() { + return description() + " (unit: " + unit().shortName() + ")"; + } + + private String withSuffix(Suffix suffix) { + return baseName() + "." + suffix.suffix(); + } + + default String ninety_five_percentile() { + return withSuffix(Suffix.ninety_five_percentile); + } + + default String ninety_nine_percentile() { + return withSuffix(Suffix.ninety_nine_percentile); + } + + default String average() { + return withSuffix(Suffix.average); + } + + default String count() { + return withSuffix(Suffix.count); + } + + default String last() { + return withSuffix(Suffix.last); + } + + default String max() { + return withSuffix(Suffix.max); + } + + default String min() { + return withSuffix(Suffix.min); + } + + default String rate() { + return withSuffix(Suffix.rate); + } + + default String sum() { + return withSuffix(Suffix.sum); + } + +} diff --git a/metrics/src/main/java/ai/vespa/metrics/package-info.java b/metrics/src/main/java/ai/vespa/metrics/package-info.java new file mode 100644 index 00000000000..98986f61dc5 --- /dev/null +++ b/metrics/src/main/java/ai/vespa/metrics/package-info.java @@ -0,0 +1,5 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +@ExportPackage +package ai.vespa.metrics; + +import com.yahoo.osgi.annotation.ExportPackage; |