diff options
author | yngveaasheim <yngve@yahooinc.com> | 2023-05-24 16:43:39 +0200 |
---|---|---|
committer | yngveaasheim <yngve@yahooinc.com> | 2023-05-24 16:43:59 +0200 |
commit | a10afb93fb088d406f37775fa738c1cf9d6469e8 (patch) | |
tree | cfd6f23f2197ced788e1147e77aca5468cfcc4c1 /metrics | |
parent | c188bfa3d033adbb48f444ebbbe825037d9de8c8 (diff) |
Use enums for node repo metrics
Diffstat (limited to 'metrics')
-rw-r--r-- | metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java | 60 | ||||
-rw-r--r-- | metrics/src/main/java/ai/vespa/metrics/Unit.java | 9 |
2 files changed, 68 insertions, 1 deletions
diff --git a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java index d323026e4ca..1c61b65f77b 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ConfigServerMetrics.java @@ -47,10 +47,68 @@ public enum ConfigServerMetrics implements VespaMetrics { ORCHESTRATOR_LOCK_HOLD_LATENCY("orchestrator.lock.hold-latency", Unit.SECOND, "Time zookeeper lock was held before it was released"), // Node repository metrics + NODES_ACTIVE("nodes.active", Unit.NODE, "The number of active nodes in a cluster"), + NODES_NON_ACTIVE("nodes.nonActive", Unit.NODE, "The number of non-active nodes in a cluster"), + NODES_NON_ACTIVE_FRACTION("nodes.nonActiveFraction", Unit.NODE, "The fraction of non-active nodes vs total nodes in a cluster"), + NODES_EXCLUSIVE_SWITCH_FRACTION("nodes.exclusiveSwitchFraction", Unit.FRACTION, "The fraction of nodes in a cluster on exclusive network switches"), + CLUSTER_COST("cluster.cost", Unit.DOLLAR_PER_HOUR, "The cost of the nodes allocated to a certain cluster, in $/hr"), CLUSTER_LOAD_IDEAL_CPU("cluster.load.ideal.cpu", Unit.FRACTION, "The ideal cpu load of a certain cluster"), CLUSTER_LOAD_IDEAL_MEMORY("cluster.load.ideal.memory", Unit.FRACTION, "The ideal memory load of a certain cluster"), - CLUSTER_LOAD_IDEAL_DISK("cluster.load.ideal.disk", Unit.FRACTION, "The ideal disk load of a certain cluster"); + CLUSTER_LOAD_IDEAL_DISK("cluster.load.ideal.disk", Unit.FRACTION, "The ideal disk load of a certain cluster"), + + ZONE_WORKING("zone.working", Unit.BINARY, "The value 1 if zone is considered healthy, 0 if not. This is decided by considering the number of non-active nodes vs the number of active nodes in a zone"), + CACHE_NODE_OBJECT_HIT_RATE("cache.nodeObject.hitRate", Unit.FRACTION, "The fraction of cache hits vs cache lookups for the node object cache"), + CACHE_NODE_OBJECT_EVICTION_COUNT("cache.nodeObject.evictionCount", Unit.ITEM, "The number of cache elements evicted from the node object cache"), + CACHE_NODE_OBJECT_SIZE("cache.nodeObject.size", Unit.ITEM, "The number of cache elements in the node object cache"), + CACHE_CURATOR_HIT_RATE("cache.curator.hitRate", Unit.FRACTION, "The fraction of cache hits vs cache lookups for the curator cache"), + CACHE_CURATOR_EVICTION_COUNT("cache.curator.evictionCount", Unit.ITEM, "The number of cache elements evicted from the curator cache"), + CACHE_CURATOR_SIZE("cache.curator.size", Unit.ITEM, "The number of cache elements in the curator cache"), + WANTED_RESTART_GENERATION("wantedRestartGeneration", Unit.GENERATION, "Wanted restart generation for tenant node"), + CURRENT_RESTART_GENERATION("currentRestartGeneration", Unit.GENERATION, "Current restart generation for tenant node"), + WANT_TO_RESTART("wantToRestart", Unit.BINARY, "One if node wants to restart, zero if not"), + WANTED_REBOOT_GENERATION("wantedRebootGeneration", Unit.GENERATION, "Wanted reboot generation for tenant node"), + CURRENT_REBOOT_GENERATION("currentRebootGeneration", Unit.GENERATION, "Current reboot generation for tenant node"), + WANT_TO_REBOOT("wantToReboot", Unit.BINARY, "One if node wants to reboot, zero if not"), + RETIRED("retired", Unit.BINARY, "One if node is retired, zero if not"), + WANTED_VESPA_VERSION("wantedVespaVersion", Unit.VERSION, "Wanted vespa version for the node, in the form <MINOR.PATCH>. Major version is not included here"), + CURRENT_VESPA_VERSION("currentVespaVersion", Unit.VERSION, "Current vespa version for the node, in the form <MINOR.PATCH>. Major version is not included here"), + WANT_TO_CHANGE_VESPA_VERSION("wantToChangeVespaVersion", Unit.BINARY, "One if node want to change Vespa version, zero if not"), + HAS_WIRE_GUARD_KEY("hasWireguardKey", Unit.BINARY, "One if node has a WireGuard key, zero if not"), + WANT_TO_RETIRE("wantToRetire", Unit.BINARY, "One if node wants to retire, zero if not"), + WANT_TO_DEPROVISION("wantToDeprovision", Unit.BINARY, "One if node wants to be deprovisioned, zero if not"), + FAIL_REPORT("failReport", Unit.BINARY, "One if there is a fail report for the node, zero if not"), + SUSPENDED("suspended", Unit.BINARY, "One if the node is suspended, zero if not"), + SUSPENDED_SECONDS("suspendedSeconds", Unit.SECOND, "The number of seconds the node has been suspended"), + NUMBER_OF_SERVICES_UP("numberOfServicesUp", Unit.INSTANCE, "The number of services confirmed to be running on a node"), + NUMBER_OF_SERVICES_NOT_CHECKED("numberOfServicesNotChecked", Unit.INSTANCE, "The number of services supposed to run on a node, that has not checked"), + NUMBER_OF_SERVICES_DOWN("numberOfServicesDown", Unit.INSTANCE, "The number of services confirmed to not be running on a node"), + SOME_SERVICES_DOWN("someServicesDown", Unit.BINARY, "One if one or more services has been confirmed to not run on a node, zero if not"), + NUMBER_OF_SERVICES_UNKNOWN("numberOfServicesUnknown", Unit.INSTANCE, "The number of services the config server does not know if is running on a node"), + NODE_FAILER_BAD_NODE("nodeFailerBadNode", Unit.BINARY, "One if the node is failed due to being bad, zero if not"), + DOWN_IN_NODE_REPO("downInNodeRepo", Unit.BINARY, "One if the node is registered as being down in the node repository, zero if not"), + NUMBER_OF_SERVICES("numberOfServices", Unit.INSTANCE, "Number of services supposed to run on a node"), + LOCK_ATTEMPT_ACQUIRE_MAX_ACTIVE_LATENCY("lockAttempt.acquireMaxActiveLatency", Unit.SECOND, "Maximum duration for keeping a lock, ending during the metrics snapshot, or still being kept at the end or this snapshot period"), + LOCK_ATTEMPT_ACQUIRE_HZ("lockAttempt.acquireHz", Unit.OPERATION_PER_SECOND, "Average number of locks acquired per second the snapshot period"), + LOCK_ATTEMPT_ACQUIRE_LOAD("lockAttempt.acquireLoad", Unit.OPERATION, "Average number of locks held concurrently during the snapshot period"), + LOCK_ATTEMPT_LOCKED_LATENCY("lockAttempt.lockedLatency", Unit.SECOND, "Longest lock duration in the snapshot period"), + LOCK_ATTEMPT_LOCKED_LOAD("lockAttempt.lockedLoad", Unit.OPERATION, "Average number of locks held concurrently during the snapshot period"), + LOCK_ATTEMPT_ACQUIRE_TIMED_OUT("lockAttempt.acquireTimedOut", Unit.OPERATION, " Number of locking attempts that timed out during the snapshot period"), + LOCK_ATTEMPT_DEADLOCK("lockAttempt.deadlock", Unit.OPERATION, "Number of lock grab deadlocks detected during the snapshont period"), + LOCK_ATTEMPT_ERRORS("lockAttempt.errors", Unit.OPERATION, "Number of other lock related errors detected during the snapshont period"), + + HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_CPU("hostedVespa.docker.totalCapacityCpu", Unit.VCPU, "Total number of VCPUs on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_MEM("hostedVespa.docker.totalCapacityMem", Unit.GIGABYTE, "Total amount of memory on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_TOTAL_CAPACITY_DISK("hostedVespa.docker.totalCapacityDisk", Unit.GIGABYTE, "Total amount of disk space on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_FREE_CAPACITY_CPU("hostedVespa.docker.freeCapacityCpu", Unit.VCPU, "Total number of free VCPUs on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_FREE_CAPACITY_MEM("hostedVespa.docker.freeCapacityMem", Unit.GIGABYTE, "Total amount of free memory on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_FREE_CAPACITY_DISK("hostedVespa.docker.freeCapacityDisk", Unit.GIGABYTE, "Total amount of free disk space on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_CPU("hostedVespa.docker.allocatedCapacityCpu", Unit.VCPU, "Total number of allocated VCPUs on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_MEM("hostedVespa.docker.allocatedCapacityMem", Unit.GIGABYTE, "Total amount of allocated memory on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_DOCKER_ALLOCATED_CAPACITY_DISK("hostedVespa.docker.allocatedCapacityDisk", Unit.GIGABYTE, "Total amount of allocated disk space on tenant hosts managed by hosted Vespa in a zone"), + HOSTED_VESPA_BREAKFIXED_HOSTS("hostedVespa.breakfixedHosts", Unit.HOST, "Number of hosts managed that are breakfixed in a zone"), + HOSTED_VESPA_PENDING_REDEPLOYMENTS("hostedVespa.pendingRedeployments", Unit.TASK, "The number of hosted Vespa re-deployments pending"), + HOSTED_VESPA_DOCKER_SKEW("hostedVespa.docker.skew", Unit.FRACTION, "A number in the range 0..1 indicating how well allocated resources are balanced with availability on hosts"); private final String name; private final Unit unit; diff --git a/metrics/src/main/java/ai/vespa/metrics/Unit.java b/metrics/src/main/java/ai/vespa/metrics/Unit.java index ee6ea569fc4..d514b9e9839 100644 --- a/metrics/src/main/java/ai/vespa/metrics/Unit.java +++ b/metrics/src/main/java/ai/vespa/metrics/Unit.java @@ -16,8 +16,11 @@ public enum Unit { FAILURE(BaseUnit.FAILURE, "Failures, typically for requests, operations or nodes"), FILE(BaseUnit.FILE, "Data file stored on the disk on a node"), FRACTION(BaseUnit.FRACTION, "A value in the range [0..1]. Higher values can occur for some metrics, but would indicate the value is outside of the allowed range."), + GENERATION(BaseUnit.GENERATION,"Typically generation of configuration or application package"), + GIGABYTE(BaseUnit.GIGABYTE,"One billion bytes"), HIT(BaseUnit.HIT, "Document that meets the filtering/restriction criteria specified by a given query"), HIT_PER_QUERY(BaseUnit.HIT, BaseUnit.QUERY, "Number of hits per query over a period of time"), + HOST(BaseUnit.HOST, "Bare metal computer that contain nodes"), INSTANCE(BaseUnit.INSTANCE, "Typically tenant or application"), ITEM(BaseUnit.ITEM, "Object or unit maintained in e.g. a queue"), MILLISECOND(BaseUnit.MILLISECOND, "Millisecond, 1/1000 of a second"), @@ -38,6 +41,8 @@ public enum Unit { SESSION(BaseUnit.SESSION, "A set of operations taking place during one connection or as part of a higher level operation"), TASK(BaseUnit.TASK, "Piece of work executed by a server, e.g. to perform back-ground data maintenance"), THREAD(BaseUnit.THREAD, "Computer thread for executing e.g. tasks, operations or queries"), + VCPU(BaseUnit.VCPU,"Virtual CPU"), + VERSION(BaseUnit.VERSION, "Software or config version"), WAKEUP(BaseUnit.WAKEUP, "Computer thread wake-ups for doing some work"); @@ -85,7 +90,10 @@ public enum Unit { FAILURE("failure"), FILE("file"), FRACTION("fraction"), + GENERATION("generation"), + GIGABYTE("gigabyte"), HIT("hit"), + HOST("host"), HOUR("hour"), INSTANCE("instance"), ITEM("item"), @@ -105,6 +113,7 @@ public enum Unit { SESSION("session"), TASK("task"), THREAD("thread"), + VCPU("vcpu"), VERSION("version"), WAKEUP("wakeup"); |