diff options
14 files changed, 410 insertions, 104 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/AutoscalingMetrics.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/AutoscalingMetrics.java index 9cb8b4e00ca..30810d428c1 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/AutoscalingMetrics.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/AutoscalingMetrics.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.model.admin.monitoring; import com.yahoo.metrics.ContainerMetrics; import com.yahoo.metrics.SearchNodeMetrics; import com.yahoo.metrics.StorageMetrics; +import com.yahoo.metrics.HostedNodeAdminMetrics; import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.List; @@ -21,19 +22,19 @@ public class AutoscalingMetrics { private static MetricSet create() { List<String> metrics = new ArrayList<>(); - metrics.add("cpu.util"); + metrics.add(HostedNodeAdminMetrics.CPU_UTIL.baseName()); // Memory util - metrics.add("mem.util"); // node level - default + metrics.add(HostedNodeAdminMetrics.MEM_UTIL.baseName()); // node level - default metrics.add(SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average()); // better for content as it is the basis for blocking // Disk util - metrics.add("disk.util"); // node level -default + metrics.add(HostedNodeAdminMetrics.DISK_UTIL.baseName()); // node level -default metrics.add(SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_DISK.average()); // better for content as it is the basis for blocking metrics.add(ContainerMetrics.APPLICATION_GENERATION.baseName()); - metrics.add("in_service"); + metrics.add(ContainerMetrics.IN_SERVICE.baseName()); // Query rate metrics.add(ContainerMetrics.QUERIES.rate()); // container diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/NetworkMetrics.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/NetworkMetrics.java index 2f9c97f0488..839dcad64ee 100644 --- 
a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/NetworkMetrics.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/NetworkMetrics.java @@ -1,6 +1,8 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.model.admin.monitoring; +import com.yahoo.metrics.HostedNodeAdminMetrics; + import com.google.common.collect.ImmutableSet; import java.util.Set; @@ -8,18 +10,21 @@ import java.util.Set; /** * @author gjoranv */ + +// TODO: Move to hosted repo. public class NetworkMetrics { public static final MetricSet networkMetricSet = createNetworkMetricSet(); private static MetricSet createNetworkMetricSet() { Set<Metric> dockerNetworkMetrics = - ImmutableSet.of(new Metric("net.in.bytes"), - new Metric("net.in.errors"), - new Metric("net.in.dropped"), - new Metric("net.out.bytes"), - new Metric("net.out.errors"), - new Metric("net.out.dropped") + ImmutableSet.of(new Metric(HostedNodeAdminMetrics.NET_IN_BYTES.baseName()), + new Metric(HostedNodeAdminMetrics.NET_IN_ERROR.baseName()), + new Metric(HostedNodeAdminMetrics.NET_IN_DROPPED.baseName()), + new Metric(HostedNodeAdminMetrics.NET_OUT_BYTES.baseName()), + new Metric(HostedNodeAdminMetrics.NET_OUT_ERROR.baseName()), + new Metric(HostedNodeAdminMetrics.NET_OUT_DROPPED.baseName()), + new Metric(HostedNodeAdminMetrics.BANDWIDTH_LIMIT.baseName()) ); return new MetricSet("network", dockerNetworkMetrics); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java index 0958a3f3908..eee6be9af93 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java @@ -1,6 +1,8 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. package com.yahoo.vespa.model.admin.monitoring; +import com.yahoo.metrics.HostedNodeAdminMetrics; + import com.google.common.collect.ImmutableSet; import java.util.Set; @@ -8,56 +10,39 @@ import java.util.Set; /** * @author gjoranv */ -public class SystemMetrics { +// TODO: Move to hosted repo. - public static final String CPU_UTIL = "cpu.util"; - public static final String CPU_SYS_UTIL = "cpu.sys.util"; - public static final String CPU_THROTTLED_TIME = "cpu.throttled_time.rate"; - public static final String CPU_THROTTLED_CPU_TIME = "cpu.throttled_cpu_time.rate"; - public static final String CPU_VCPUS = "cpu.vcpus"; - public static final String DISK_LIMIT = "disk.limit"; - public static final String DISK_USED = "disk.used"; - public static final String DISK_UTIL = "disk.util"; - public static final String MEM_LIMIT = "mem.limit"; - public static final String MEM_USED = "mem.used"; - public static final String MEM_UTIL = "mem.util"; - public static final String MEM_TOTAL_USED = "mem_total.used"; - public static final String MEM_TOTAL_UTIL = "mem_total.util"; - public static final String BANDWIDTH_LIMIT = "bandwidth.limit"; - public static final String GPU_UTIL = "gpu.util"; - public static final String GPU_MEM_USED = "gpu.memory.used"; - public static final String GPU_MEM_TOTAL = "gpu.memory.total"; +public class SystemMetrics { public static final MetricSet systemMetricSet = createSystemMetricSet(); private static MetricSet createSystemMetricSet() { Set<Metric> dockerNodeMetrics = - ImmutableSet.of(new Metric(CPU_UTIL), - new Metric(CPU_SYS_UTIL), - new Metric(CPU_THROTTLED_TIME), - new Metric(CPU_THROTTLED_CPU_TIME), - new Metric(CPU_VCPUS), - new Metric(DISK_LIMIT), - new Metric(DISK_USED), - new Metric(DISK_UTIL), - new Metric(MEM_LIMIT), - new Metric(MEM_USED), - new Metric(MEM_UTIL), - new Metric(MEM_TOTAL_USED), - new Metric(MEM_TOTAL_UTIL), - new Metric(BANDWIDTH_LIMIT), - new Metric(GPU_UTIL), - new Metric(GPU_MEM_USED), - 
new Metric(GPU_MEM_TOTAL) + ImmutableSet.of(new Metric(HostedNodeAdminMetrics.CPU_UTIL.baseName()), + new Metric(HostedNodeAdminMetrics.CPU_SYS_UTIL.baseName()), + new Metric(HostedNodeAdminMetrics.CPU_THROTTLED_TIME.baseName()), + new Metric(HostedNodeAdminMetrics.CPU_THROTTLED_CPU_TIME.baseName()), + new Metric(HostedNodeAdminMetrics.CPU_VCPUS.baseName()), + new Metric(HostedNodeAdminMetrics.DISK_LIMIT.baseName()), + new Metric(HostedNodeAdminMetrics.DISK_USED.baseName()), + new Metric(HostedNodeAdminMetrics.DISK_UTIL.baseName()), + new Metric(HostedNodeAdminMetrics.MEM_LIMIT.baseName()), + new Metric(HostedNodeAdminMetrics.MEM_USED.baseName()), + new Metric(HostedNodeAdminMetrics.MEM_UTIL.baseName()), + new Metric(HostedNodeAdminMetrics.MEM_TOTAL_USED.baseName()), + new Metric(HostedNodeAdminMetrics.MEM_TOTAL_UTIL.baseName()), + new Metric(HostedNodeAdminMetrics.GPU_UTIL.baseName()), + new Metric(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()), + new Metric(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()) ); Set<Metric> nonDockerNodeMetrics = // Disk metrics should be based on /home, or else '/' - or simply add filesystem as dimension - ImmutableSet.of(new Metric("cpu.busy.pct", CPU_UTIL), - new Metric("mem.used.pct", MEM_UTIL), - new Metric("mem.active.kb", MEM_USED), - new Metric("mem.total.kb", MEM_LIMIT), - new Metric("used.kb", DISK_USED) + ImmutableSet.of(new Metric("cpu.busy.pct", HostedNodeAdminMetrics.CPU_UTIL.baseName()), + new Metric("mem.used.pct", HostedNodeAdminMetrics.MEM_UTIL.baseName()), + new Metric("mem.active.kb", HostedNodeAdminMetrics.MEM_USED.baseName()), + new Metric("mem.total.kb", HostedNodeAdminMetrics.MEM_LIMIT.baseName()), + new Metric("used.kb", HostedNodeAdminMetrics.DISK_USED.baseName()) ); Set<Metric> systemMetrics = ImmutableSet.<Metric>builder() diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java 
b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java index 000ac92a8a0..16a5a501b0e 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/VespaMetricSet.java @@ -1,10 +1,15 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.model.admin.monitoring; +import com.yahoo.metrics.ConfigServerMetrics; import com.yahoo.metrics.ContainerMetrics; import com.yahoo.metrics.DistributorMetrics; +import com.yahoo.metrics.LogdMetrics; import com.yahoo.metrics.SearchNodeMetrics; +import com.yahoo.metrics.SentinelMetrics; +import com.yahoo.metrics.SlobrokMetrics; import com.yahoo.metrics.StorageMetrics; +import com.yahoo.metrics.NodeAdminMetrics; import com.yahoo.metrics.Suffix; import java.util.Collections; @@ -55,12 +60,10 @@ public class VespaMetricSet { private static Set<Metric> getSentinelMetrics() { Set<Metric> metrics = new LinkedHashSet<>(); - addMetric(metrics, "sentinel.restarts.count"); - addMetric(metrics, "sentinel.totalRestarts.last"); - addMetric(metrics, "sentinel.uptime.last"); - - addMetric(metrics, "sentinel.running.count"); - addMetric(metrics, "sentinel.running.last"); + addMetric(metrics, SentinelMetrics.SENTINEL_RESTARTS.count()); + addMetric(metrics, SentinelMetrics.SENTINEL_TOTAL_RESTARTS.last()); + addMetric(metrics, SentinelMetrics.SENTINEL_UPTIME.last()); + addMetric(metrics, SentinelMetrics.SENTINEL_RUNNING, EnumSet.of(count, last)); return metrics; } @@ -68,39 +71,39 @@ public class VespaMetricSet { private static Set<Metric> getOtherMetrics() { Set<Metric> metrics = new LinkedHashSet<>(); - addMetric(metrics, "slobrok.heartbeats.failed.count"); - addMetric(metrics, "slobrok.missing.consensus.count"); + addMetric(metrics, SlobrokMetrics.SLOBROK_HEARTBEATS_FAILED.count()); + addMetric(metrics, 
SlobrokMetrics.SLOBROK_MISSING_CONSENSUS.count()); - addMetric(metrics, "logd.processed.lines.count"); - addMetric(metrics, "worker.connections.max"); - addMetric(metrics, "endpoint.certificate.expiry.seconds"); + addMetric(metrics, LogdMetrics.LOGD_PROCESSED_LINES.count()); // Java (JRT) TLS metrics - addMetric(metrics, "jrt.transport.tls-certificate-verification-failures"); - addMetric(metrics, "jrt.transport.peer-authorization-failures"); - addMetric(metrics, "jrt.transport.server.tls-connections-established"); - addMetric(metrics, "jrt.transport.client.tls-connections-established"); - addMetric(metrics, "jrt.transport.server.unencrypted-connections-established"); - addMetric(metrics, "jrt.transport.client.unencrypted-connections-established"); + addMetric(metrics, ContainerMetrics.JRT_TRANSPORT_TLS_CERTIFICATE_VERIFICATION_FAILURES.baseName()); + addMetric(metrics, ContainerMetrics.JRT_TRANSPORT_PEER_AUTHORIZATION_FAILURES.baseName()); + addMetric(metrics, ContainerMetrics.JRT_TRANSPORT_SERVER_TLS_CONNECIONTS_ESTABLISHED.baseName()); + addMetric(metrics, ContainerMetrics.JRT_TRANSPORT_CLIENT_TLS_CONNECTIONS_ESTABLISHED.baseName()); + addMetric(metrics, ContainerMetrics.JRT_TRANSPORT_SERVER_UNENCRYPTED_CONNECTIONS_ESTABLISHED.baseName()); + addMetric(metrics, ContainerMetrics. JRT_TRANSPORT_CLIENT_UNENCRYPTED_CONNECTIONS_ESTABLISHED. 
baseName()); // C++ TLS metrics - addMetric(metrics, "vds.server.network.tls-handshakes-failed"); - addMetric(metrics, "vds.server.network.peer-authorization-failures"); - addMetric(metrics, "vds.server.network.client.tls-connections-established"); - addMetric(metrics, "vds.server.network.server.tls-connections-established"); - addMetric(metrics, "vds.server.network.client.insecure-connections-established"); - addMetric(metrics, "vds.server.network.server.insecure-connections-established"); - addMetric(metrics, "vds.server.network.tls-connections-broken"); - addMetric(metrics, "vds.server.network.failed-tls-config-reloads"); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_TLS_HANDSHAKES_FAILED.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_PEER_AUTHORIZATION_FAILURES.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_CLIENT_TLS_CONNECTIONS_ESTABLISHED.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_SERVER_TLS_CONNECTIONS_ESTABLISHED.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_CLIENT_INSECURE_CONNECTIONS_ESTABLISHED.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_SERVER_INSECURE_CONNECTIONS_ESTABLISHED.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_TLS_CONNECTIONS_BROKEN.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_FAILED_TLS_CONFIG_RELOADS.count()); // C++ capability metrics - addMetric(metrics, "vds.server.network.rpc-capability-checks-failed"); - addMetric(metrics, "vds.server.network.status-capability-checks-failed"); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_RPC_CAPABILITY_CHECKS_FAILED.count()); + addMetric(metrics, StorageMetrics.VDS_SERVER_NETWORK_STATUS_CAPABILITY_CHECKS_FAILED.count()); // C++ Fnet metrics - addMetric(metrics, "vds.server.fnet.num-connections"); + addMetric(metrics, StorageMetrics.VDS_SERVER_FNET_NUM_CONNECTIONS.count()); - // Node certificate - addMetric(metrics, 
"node-certificate.expiry.seconds"); + // NodeAdmin certificate + addMetric(metrics, NodeAdminMetrics.WORKER_CONNECTIONS.max()); // Hosted Vespa only (routing layer) + addMetric(metrics, NodeAdminMetrics.ENDPOINT_CERTIFICATE_EXPIRY_SECONDS.baseName()); + addMetric(metrics, NodeAdminMetrics.NODE_CERTIFICATE_EXPIRY_SECONDS.baseName()); return metrics; } @@ -108,22 +111,20 @@ public class VespaMetricSet { private static Set<Metric> getConfigServerMetrics() { Set<Metric> metrics =new LinkedHashSet<>(); - addMetric(metrics, "configserver.requests.count"); - addMetric(metrics, "configserver.failedRequests.count"); - addMetric(metrics, "configserver.latency.max"); - addMetric(metrics, "configserver.latency.sum"); - addMetric(metrics, "configserver.latency.count"); - addMetric(metrics, "configserver.cacheConfigElems.last"); - addMetric(metrics, "configserver.cacheChecksumElems.last"); - addMetric(metrics, "configserver.hosts.last"); - addMetric(metrics, "configserver.delayedResponses.count"); - addMetric(metrics, "configserver.sessionChangeErrors.count"); - - addMetric(metrics, "configserver.zkZNodes.last"); - addMetric(metrics, "configserver.zkAvgLatency.last"); - addMetric(metrics, "configserver.zkMaxLatency.last"); - addMetric(metrics, "configserver.zkConnections.last"); - addMetric(metrics, "configserver.zkOutstandingRequests.last"); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_REQUESTS.count()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_FAILED_REQUESTS.count()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_LATENCY, EnumSet.of(max, sum, count)); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_CACHE_CONFIG_ELEMS.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_CACHE_CHECKSUM_ELEMS.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_HOSTS.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_DELAYED_RESPONSES.count()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_SESSION_CHANGE_ERRORS.count()); 
+ + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_ZK_Z_NODES.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_ZK_AVG_LATENCY.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_ZK_MAX_LATENCY.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_ZK_CONNECTIONS.last()); + addMetric(metrics, ConfigServerMetrics.CONFIGSERVER_ZK_OUTSTANDING_REQUESTS.last()); return metrics; } @@ -721,6 +722,18 @@ public class VespaMetricSet { private static void addMetric(Set<Metric> metrics, DistributorMetrics metric, EnumSet<Suffix> suffixes) { suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); } + private static void addMetric(Set<Metric> metrics, SentinelMetrics metric, EnumSet<Suffix> suffixes) { + suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); + } + private static void addMetric(Set<Metric> metrics, SlobrokMetrics metric, EnumSet<Suffix> suffixes) { + suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); + } + private static void addMetric(Set<Metric> metrics, LogdMetrics metric, EnumSet<Suffix> suffixes) { + suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); + } + private static void addMetric(Set<Metric> metrics, ConfigServerMetrics metric, EnumSet<Suffix> suffixes) { + suffixes.forEach(suffix -> metrics.add(new Metric(metric.baseName() + "." + suffix.suffix()))); + } private static void addMetric(Set<Metric> metrics, String metricName, Iterable<String> aggregateSuffices) { for (String suffix : aggregateSuffices) { metrics.add(new Metric(metricName + "." 
+ suffix)); diff --git a/container-core/src/main/java/com/yahoo/metrics/ConfigServerMetrics.java b/container-core/src/main/java/com/yahoo/metrics/ConfigServerMetrics.java new file mode 100644 index 00000000000..62da9cec055 --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/ConfigServerMetrics.java @@ -0,0 +1,60 @@ +package com.yahoo.metrics; + +/** + * @author yngveaasheim + */ +public enum ConfigServerMetrics implements VespaMetrics { + + CONFIGSERVER_REQUESTS("configserver.requests", Unit.REQUEST, "Number of requests processed"), + CONFIGSERVER_FAILED_REQUESTS("configserver.failedRequests", Unit.REQUEST, "Number of requests that failed"), + CONFIGSERVER_LATENCY("configserver.latency", Unit.MILLISECOND, "Time to complete requests"), + CONFIGSERVER_CACHE_CONFIG_ELEMS("configserver.cacheConfigElems", Unit.ITEM, "Number of config elements in the cache"), + CONFIGSERVER_CACHE_CHECKSUM_ELEMS("configserver.cacheChecksumElems", Unit.ITEM, "Number of checksum elements in the cache"), + CONFIGSERVER_HOSTS("configserver.hosts", Unit.NODE, "The number of nodes being served configuration from the config server cluster"), + CONFIGSERVER_TENANTS("configserver.tenants", Unit.INSTANCE, "The number of tenants being served configuration from the config server cluster"), + CONFIGSERVER_APPLICATIONS("configserver.applications", Unit.INSTANCE, "The number of applications being served configuration from the config server cluster"), + CONFIGSERVER_DELAYED_RESPONSES("configserver.delayedResponses", Unit.RESPONSE, "Number of delayed responses"), + CONFIGSERVER_SESSION_CHANGE_ERRORS("configserver.sessionChangeErrors", Unit.SESSION, "Number of session change errors"), + CONFIGSERVER_UNKNOWN_HOST_REQUEST("configserver.unknownHostRequests", Unit.REQUEST, "Config requests from unknown hosts"), + CONFIGSERVER_NEW_SESSIONS("configserver.newSessions", Unit.SESSION, "New config sessions"), + CONFIGSERVER_PREPARED_SESSIONS("configserver.preparedSessions", Unit.SESSION, "Prepared config sessions"), + 
CONFIGSERVER_ACTIVE_SESSIONS("configserver.activeSessions", Unit.SESSION, "Active config sessions"), + CONFIGSERVER_INACTIVE_SESSIONS("configserver.inactiveSessions", Unit.SESSION, "Inactive config sessions"), + CONFIGSERVER_ADDED_SESSIONS("configserver.addedSessions", Unit.SESSION, "Added config sessions"), + CONFIGSERVER_REMOVED_SESSIONS("configserver.removedSessions", Unit.SESSION, "Removed config sessions"), + CONFIGSERVER_RPC_SERVER_WORK_QUEUE_SIZE("configserver.rpcServerWorkQueueSize", Unit.ITEM, "Number of elements in the RPC server work queue"), + + // ZooKeeper related metrics + CONFIGSERVER_ZK_CONNECTIONS_LOST("configserver.zkConnectionLost", Unit.CONNECTION, "Number of ZooKeeper connections lost"), + CONFIGSERVER_ZK_RECONNECTED("configserver.zkReconnected", Unit.CONNECTION, "Number of ZooKeeper reconnections"), + CONFIGSERVER_ZK_CONNECTED("configserver.zkConnected", Unit.NODE, "Number of ZooKeeper nodes connected"), + CONFIGSERVER_ZK_SUSPENDED("configserver.zkSuspended", Unit.NODE, "Number of ZooKeeper nodes suspended"), + CONFIGSERVER_ZK_Z_NODES("configserver.zkZNodes", Unit.NODE, "Number of ZooKeeper nodes present"), + CONFIGSERVER_ZK_AVG_LATENCY("configserver.zkAvgLatency", Unit.MILLISECOND, "Average latency for ZooKeeper requests"), // TODO: Confirm metric name + CONFIGSERVER_ZK_MAX_LATENCY("configserver.zkMaxLatency", Unit.MILLISECOND, "Max latency for ZooKeeper requests"), + CONFIGSERVER_ZK_CONNECTIONS("configserver.zkConnections", Unit.CONNECTION, "Number of ZooKeeper connections"), + CONFIGSERVER_ZK_OUTSTANDING_REQUESTS("configserver.zkOutstandingRequests", Unit.REQUEST, "Number of ZooKeeper requests in flight"); + + private final String name; + private final Unit unit; + private final String description; + + ConfigServerMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + 
public String description() { + return description; + } + +}
\ No newline at end of file diff --git a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java index 0b61c937cb8..ed1d6f7a001 100644 --- a/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java +++ b/container-core/src/main/java/com/yahoo/metrics/ContainerMetrics.java @@ -12,6 +12,7 @@ public enum ContainerMetrics implements VespaMetrics { HTTP_STATUS_5XX("http.status.5xx", Unit.RESPONSE, "Number of responses with a 5xx status"), APPLICATION_GENERATION("application_generation", Unit.VERSION, "The currently live application config generation (aka session id)"), + IN_SERVICE("in_service", Unit.BINARY, "This will have the value 1 if the node is in service, 0 if not."), JDISC_GC_COUNT("jdisc.gc.count", Unit.OPERATION, "Number of JVM garbage collections done"), JDISC_GC_MS("jdisc.gc.ms", Unit.MILLISECOND, "Time spent in JVM garbage collection"), @@ -184,7 +185,15 @@ public enum ContainerMetrics implements VespaMetrics { CLUSTER_CONTROLLER_RESOURCE_USAGE_MAX_DISK_UTILIZATION("cluster-controller.resource_usage.max_disk_utilization", Unit.FRACTION, "Current disk space utilisation, per content node"), CLUSTER_CONTROLLER_RESOURCE_USAGE_MEMORY_LIMIT("cluster-controller.resource_usage.memory_limit", Unit.FRACTION, "Disk space limit as a fraction of available disk space"), CLUSTER_CONTROLLER_RESOURCE_USAGE_DISK_LIMIT("cluster-controller.resource_usage.disk_limit", Unit.FRACTION, "Memory space limit as a fraction of available memory"), - CLUSTER_CONTROLLER_REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress"); + CLUSTER_CONTROLLER_REINDEXING_PROGRESS("reindexing.progress", Unit.FRACTION, "Re-indexing progress"), + + // Java (JRT) TLS metrics + JRT_TRANSPORT_TLS_CERTIFICATE_VERIFICATION_FAILURES("jrt.transport.tls-certificate-verification-failures", Unit.FAILURE, "TLS certificate verification failures"), + 
JRT_TRANSPORT_PEER_AUTHORIZATION_FAILURES("jrt.transport.peer-authorization-failures", Unit.FAILURE, "TLS peer authorization failures"), + JRT_TRANSPORT_SERVER_TLS_CONNECIONTS_ESTABLISHED("jrt.transport.server.tls-connections-established", Unit.CONNECTION, "TLS server connections established"), + JRT_TRANSPORT_CLIENT_TLS_CONNECTIONS_ESTABLISHED("jrt.transport.client.tls-connections-established", Unit.CONNECTION, "TLS client connections established"), + JRT_TRANSPORT_SERVER_UNENCRYPTED_CONNECTIONS_ESTABLISHED("jrt.transport.server.unencrypted-connections-established", Unit.CONNECTION, "Unencrypted server connections established"), + JRT_TRANSPORT_CLIENT_UNENCRYPTED_CONNECTIONS_ESTABLISHED("jrt.transport.client.unencrypted-connections-established", Unit.CONNECTION, "Unencrypted client connections established"); private final String name; diff --git a/container-core/src/main/java/com/yahoo/metrics/HostedNodeAdminMetrics.java b/container-core/src/main/java/com/yahoo/metrics/HostedNodeAdminMetrics.java new file mode 100644 index 00000000000..5624f1f92e3 --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/HostedNodeAdminMetrics.java @@ -0,0 +1,61 @@ +package com.yahoo.metrics; + +/** + * @author yngveaasheim + */ + +// TODO: Move to hosted repo. 
+public enum HostedNodeAdminMetrics implements VespaMetrics { + + // System metrics + CPU_UTIL("cpu.util", Unit.FRACTION, "CPU utilisation"), + CPU_SYS_UTIL("cpu.sys.util", Unit.FRACTION, "System CPU utilisation"), + CPU_THROTTLED_TIME("cpu.throttled_time.rate", Unit.FRACTION, "Part of the time CPU is exhausted (CPU throttling enforced)"), + CPU_THROTTLED_CPU_TIME("cpu.throttled_cpu_time.rate", Unit.FRACTION, "Part of the time CPU is exhausted (CPU throttling enforced)"), + CPU_VCPUS("cpu.vcpus", Unit.ITEM, "Number of virtual CPU threads allocated to the node"), + DISK_LIMIT("disk.limit", Unit.BYTE, "Amount of disk space available on the node"), + DISK_USED("disk.used", Unit.BYTE, "Amount of disk space used by the node"), + DISK_UTIL("disk.util", Unit.FRACTION, "Disk space utilisation"), + MEM_LIMIT("mem.limit", Unit.BYTE, "Amount of memory available on the node"), + MEM_USED("mem.used", Unit.BYTE, "Amount of memory used by the node"), + MEM_UTIL("mem.util", Unit.FRACTION, "Memory utilisation"), + MEM_TOTAL_USED("mem_total.used", Unit.BYTE, "Total amount of memory used by the node, including OS buffer caches"), + MEM_TOTAL_UTIL("mem_total.util", Unit.FRACTION, "Total memory utilisation"), + GPU_UTIL("gpu.util", Unit.FRACTION, "GPU utilisation"), + GPU_MEM_USED("gpu.memory.used", Unit.BYTE, "GPU memory used"), + GPU_MEM_TOTAL("gpu.memory.total", Unit.BYTE, "GPU memory available"), + + + // Network metrics + NET_IN_BYTES("net.in.bytes", Unit.BYTE, "Network bytes received (rxBytes) (COUNT metric)"), + NET_IN_ERROR("net.in.errors", Unit.FAILURE, "Network receive errors (rxErrors)"), + NET_IN_DROPPED("net.in.dropped", Unit.PACKET, "Inbound network packets dropped (rxDropped)"), + NET_OUT_BYTES("net.out.bytes", Unit.BYTE, "Network bytes sent (txBytes) (COUNT metric)"), + NET_OUT_ERROR("net.out.errors", Unit.FAILURE, "Network send errors (txErrors)"), + NET_OUT_DROPPED("net.out.dropped", Unit.PACKET, "Outbound network packets dropped (txDropped)"), + 
BANDWIDTH_LIMIT("bandwidth.limit", Unit.BYTE_PER_SECOND, "Available network bandwidth"); + + private final String name; + private final Unit unit; + private final String description; + + HostedNodeAdminMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} + diff --git a/container-core/src/main/java/com/yahoo/metrics/LogdMetrics.java b/container-core/src/main/java/com/yahoo/metrics/LogdMetrics.java new file mode 100644 index 00000000000..3dae4283b9f --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/LogdMetrics.java @@ -0,0 +1,33 @@ +package com.yahoo.metrics; + +/** + * @author yngveaasheim + */ +public enum LogdMetrics implements VespaMetrics { + + LOGD_PROCESSED_LINES("logd.processed.lines", Unit.ITEM, "Number of log lines processed"); + + private final String name; + private final Unit unit; + private final String description; + + LogdMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} + diff --git a/container-core/src/main/java/com/yahoo/metrics/NodeAdminMetrics.java b/container-core/src/main/java/com/yahoo/metrics/NodeAdminMetrics.java new file mode 100644 index 00000000000..065ece33ecf --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/NodeAdminMetrics.java @@ -0,0 +1,36 @@ +package com.yahoo.metrics; + +/** + * @author yngveaasheim + */ +public enum NodeAdminMetrics implements VespaMetrics { + + WORKER_CONNECTIONS("worker.connections", Unit.CONNECTION, "Yahoo! 
Internal: Number of connections for the routing worker having most connections per node"), // Hosted Vespa only (routing layer) TODO: Move to a better place + ENDPOINT_CERTIFICATE_EXPIRY_SECONDS("endpoint.certificate.expiry.seconds", Unit.SECOND, "Time until node endpoint certificate expires"), + NODE_CERTIFICATE_EXPIRY_SECONDS("node-certificate.expiry.seconds", Unit.SECOND, "Time until node certificate expires"); + + + private final String name; + private final Unit unit; + private final String description; + + NodeAdminMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} + diff --git a/container-core/src/main/java/com/yahoo/metrics/SentinelMetrics.java b/container-core/src/main/java/com/yahoo/metrics/SentinelMetrics.java new file mode 100644 index 00000000000..7711b7e75f4 --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/SentinelMetrics.java @@ -0,0 +1,36 @@ +package com.yahoo.metrics; + +/** + * @author yngve + */ +public enum SentinelMetrics implements VespaMetrics { + + SENTINEL_RESTARTS("sentinel.restarts", Unit.RESTART, "Number of service restarts done by the sentinel"), + SENTINEL_TOTAL_RESTARTS("sentinel.totalRestarts", Unit.RESTART, "Total number of service restarts done by the sentinel since the sentinel was started"), + SENTINEL_UPTIME("sentinel.uptime", Unit.SECOND, "Time the sentinel has been running"), + SENTINEL_RUNNING("sentinel.running", Unit.INSTANCE, "Number of services the sentinel has running currently"); + + + private final String name; + private final Unit unit; + private final String description; + + SentinelMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + 
public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/container-core/src/main/java/com/yahoo/metrics/SlobrokMetrics.java b/container-core/src/main/java/com/yahoo/metrics/SlobrokMetrics.java new file mode 100644 index 00000000000..8c30bf8e414 --- /dev/null +++ b/container-core/src/main/java/com/yahoo/metrics/SlobrokMetrics.java @@ -0,0 +1,37 @@ +package com.yahoo.metrics; + +/** + * @author yngve + */ +public enum SlobrokMetrics implements VespaMetrics { + + SLOBROK_HEARTBEATS_FAILED("slobrok.heartbeats.failed", Unit.REQUEST, "Number of heartbeat requests failed"), + SLOBROK_REQUESTS_REGISTER("slobrok.requests.register", Unit.REQUEST, "Number of register requests received"), + SLOBROK_REQUESTS_MIRROR("slobrok.requests.mirror", Unit.REQUEST, "Number of mirroring requests received"), + SLOBROK_REQUESTS_ADMIN("slobrok.requests.admin", Unit.REQUEST, "Number of administrative requests received"), + SLOBROK_MISSING_CONSENSUS("slobrok.missing.consensus", Unit.SECOND, "Number of seconds without full consensus with all other brokers"); + + + private final String name; + private final Unit unit; + private final String description; + + SlobrokMetrics(String name, Unit unit, String description) { + this.name = name; + this.unit = unit; + this.description = description; + } + + public String baseName() { + return name; + } + + public Unit unit() { + return unit; + } + + public String description() { + return description; + } + +} diff --git a/container-core/src/main/java/com/yahoo/metrics/StorageMetrics.java b/container-core/src/main/java/com/yahoo/metrics/StorageMetrics.java index 2a59e5a9d92..d67b67d04b7 100644 --- a/container-core/src/main/java/com/yahoo/metrics/StorageMetrics.java +++ b/container-core/src/main/java/com/yahoo/metrics/StorageMetrics.java @@ -75,7 +75,26 @@ public enum StorageMetrics implements VespaMetrics { VDS_MERGETHROTTLER_LOCALLYEXECUTEDMERGES_OK("vds.mergethrottler.locallyexecutedmerges.ok", 
Unit.INSTANCE, "The number of successful merges for 'locallyexecutedmerges'"), VDS_MERGETHROTTLER_MERGECHAINS_OK("vds.mergethrottler.mergechains.ok", Unit.INSTANCE, "The number of successful merges for 'mergechains'"), VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_BUSY("vds.mergethrottler.mergechains.failures.busy", Unit.INSTANCE, "The number of merges that failed because the storage node was busy"), - VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_TOTAL("vds.mergethrottler.mergechains.failures.total", Unit.INSTANCE, "Sum of all failures"); + VDS_MERGETHROTTLER_MERGECHAINS_FAILURES_TOTAL("vds.mergethrottler.mergechains.failures.total", Unit.INSTANCE, "Sum of all failures"), + + + // C++ TLS metrics - these come from both the distributor and storage + VDS_SERVER_NETWORK_TLS_HANDSHAKES_FAILED("vds.server.network.tls-handshakes-failed", Unit.OPERATION, "Number of client or server connection attempts that failed during TLS handshaking"), + VDS_SERVER_NETWORK_PEER_AUTHORIZATION_FAILURES("vds.server.network.peer-authorization-failures", Unit.FAILURE, "Number of TLS connection attempts failed due to bad or missing peer certificate credentials"), + VDS_SERVER_NETWORK_CLIENT_TLS_CONNECTIONS_ESTABLISHED("vds.server.network.client.tls-connections-established", Unit.CONNECTION, "Number of secure mTLS connections established"), + VDS_SERVER_NETWORK_SERVER_TLS_CONNECTIONS_ESTABLISHED("vds.server.network.server.tls-connections-established", Unit.CONNECTION, "Number of secure mTLS connections established"), + VDS_SERVER_NETWORK_CLIENT_INSECURE_CONNECTIONS_ESTABLISHED("vds.server.network.client.insecure-connections-established", Unit.CONNECTION, "Number of insecure (plaintext) connections established"), + VDS_SERVER_NETWORK_SERVER_INSECURE_CONNECTIONS_ESTABLISHED("vds.server.network.server.insecure-connections-established", Unit.CONNECTION, "Number of insecure (plaintext) connections established"), + VDS_SERVER_NETWORK_TLS_CONNECTIONS_BROKEN("vds.server.network.tls-connections-broken", 
Unit.CONNECTION, "Number of TLS connections broken due to failures during frame encoding or decoding"), + VDS_SERVER_NETWORK_FAILED_TLS_CONFIG_RELOADS("vds.server.network.failed-tls-config-reloads", Unit.FAILURE, "Number of times background reloading of TLS config has failed"), + + // C++ capability metrics + VDS_SERVER_NETWORK_RPC_CAPABILITY_CHECKS_FAILED("vds.server.network.rpc-capability-checks-failed", Unit.FAILURE, "Number of RPC operations that failed to due one or more missing capabilities"), + VDS_SERVER_NETWORK_STATUS_CAPABILITY_CHECKS_FAILED("vds.server.network.status-capability-checks-failed", Unit.FAILURE, "Number of status page operations that failed to due one or more missing capabilities"), + + // C++ Fnet metrics + VDS_SERVER_FNET_NUM_CONNECTIONS("vds.server.fnet.num-connections", Unit.CONNECTION, "Total number of connection objects"); + private final String name; private final Unit unit; diff --git a/container-core/src/main/java/com/yahoo/metrics/Unit.java b/container-core/src/main/java/com/yahoo/metrics/Unit.java index bb7718ddb4c..7411b5b0ca4 100644 --- a/container-core/src/main/java/com/yahoo/metrics/Unit.java +++ b/container-core/src/main/java/com/yahoo/metrics/Unit.java @@ -9,9 +9,11 @@ public enum Unit { BINARY(BaseUnit.BINARY), BUCKET(BaseUnit.BUCKET), BYTE(BaseUnit.BYTE), + BYTE_PER_SECOND(BaseUnit.BYTE, BaseUnit.SECOND), CONNECTION(BaseUnit.CONNECTION), DOCUMENT(BaseUnit.DOCUMENT), DOCUMENTID(BaseUnit.DOCUMENTID), + FAILURE(BaseUnit.FAILURE), FILE(BaseUnit.FILE), FRACTION(BaseUnit.FRACTION), HIT(BaseUnit.HIT), @@ -21,6 +23,7 @@ public enum Unit { MILLISECOND(BaseUnit.MILLISECOND), NANOSECOND(BaseUnit.NANOSECOND), NODE(BaseUnit.NODE), + PACKET(BaseUnit.PACKET), OPERATION(BaseUnit.OPERATION), OPERATION_PER_SECOND(BaseUnit.OPERATION, BaseUnit.SECOND), QUERY(BaseUnit.QUERY), @@ -28,8 +31,10 @@ public enum Unit { RECORD(BaseUnit.RECORD), REQUEST(BaseUnit.REQUEST), RESPONSE(BaseUnit.RESPONSE), + RESTART(BaseUnit.RESTART), SCORE(BaseUnit.SCORE), 
SECOND(BaseUnit.SECOND), + SESSION(BaseUnit.SESSION), TASK(BaseUnit.TASK), THREAD(BaseUnit.THREAD), VERSION(BaseUnit.VERSION), @@ -69,6 +74,7 @@ public enum Unit { CONNECTION("connection"), DOCUMENT("document"), DOCUMENTID("documentid"), + FAILURE("failure"), FILE("file"), FRACTION("fraction"), HIT("hit"), @@ -78,12 +84,15 @@ public enum Unit { NANOSECOND("nanosecond", "ns"), NODE("node"), OPERATION("operation"), + PACKET("packet"), QUERY("query"), RECORD("record"), REQUEST("request"), RESPONSE("response"), + RESTART("restart"), SCORE("score"), SECOND("second", "s"), + SESSION("session"), TASK("task"), THREAD("thread"), VERSION("version"), diff --git a/container-disc/src/main/java/com/yahoo/container/jdisc/metric/JrtMetrics.java b/container-disc/src/main/java/com/yahoo/container/jdisc/metric/JrtMetrics.java index ca6b41962fe..24bb862cad5 100644 --- a/container-disc/src/main/java/com/yahoo/container/jdisc/metric/JrtMetrics.java +++ b/container-disc/src/main/java/com/yahoo/container/jdisc/metric/JrtMetrics.java @@ -1,8 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.container.jdisc.metric; +// import com.yahoo.jdisc.Container; import com.yahoo.jdisc.Metric; import com.yahoo.jrt.TransportMetrics; +import com.yahoo.metrics.ContainerMetrics; import static com.yahoo.jrt.TransportMetrics.Snapshot; @@ -24,12 +26,12 @@ class JrtMetrics { void emitMetrics() { Snapshot snapshot = transportMetrics.snapshot(); Snapshot changesSincePrevious = snapshot.changesSince(previousSnapshot); - increment("jrt.transport.tls-certificate-verification-failures", changesSincePrevious.tlsCertificateVerificationFailures()); - increment("jrt.transport.peer-authorization-failures", changesSincePrevious.peerAuthorizationFailures()); - increment("jrt.transport.server.tls-connections-established", changesSincePrevious.serverTlsConnectionsEstablished()); - increment("jrt.transport.client.tls-connections-established", changesSincePrevious.clientTlsConnectionsEstablished()); - increment("jrt.transport.server.unencrypted-connections-established", changesSincePrevious.serverUnencryptedConnectionsEstablished()); - increment("jrt.transport.client.unencrypted-connections-established", changesSincePrevious.clientUnencryptedConnectionsEstablished()); + increment(ContainerMetrics.JRT_TRANSPORT_TLS_CERTIFICATE_VERIFICATION_FAILURES.baseName(), changesSincePrevious.tlsCertificateVerificationFailures()); + increment(ContainerMetrics.JRT_TRANSPORT_PEER_AUTHORIZATION_FAILURES.baseName(), changesSincePrevious.peerAuthorizationFailures()); + increment(ContainerMetrics.JRT_TRANSPORT_SERVER_TLS_CONNECIONTS_ESTABLISHED.baseName(), changesSincePrevious.serverTlsConnectionsEstablished()); + increment(ContainerMetrics.JRT_TRANSPORT_CLIENT_TLS_CONNECTIONS_ESTABLISHED.baseName(), changesSincePrevious.clientTlsConnectionsEstablished()); + increment(ContainerMetrics.JRT_TRANSPORT_SERVER_UNENCRYPTED_CONNECTIONS_ESTABLISHED.baseName(), changesSincePrevious.serverUnencryptedConnectionsEstablished()); + 
increment(ContainerMetrics.JRT_TRANSPORT_CLIENT_UNENCRYPTED_CONNECTIONS_ESTABLISHED.baseName(), changesSincePrevious.clientUnencryptedConnectionsEstablished()); previousSnapshot = snapshot; } |