diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2021-04-27 17:25:13 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerij92@gmail.com> | 2021-04-29 21:07:37 +0200 |
commit | f4b248296b9322bbd59f8802ffe9f5d5b56c5ef0 (patch) | |
tree | c5ae2eaf270e818848fe2fef6eb88f29df45bbc5 /configserver | |
parent | c878b148c9bf13cd7e6475217a68d8f47df5df88 (diff) |
Expose content resource usage metrics from cluster-controller
Diffstat (limited to 'configserver')
5 files changed, 112 insertions, 24 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/http/v2/DeploymentMetricsResponse.java b/configserver/src/main/java/com/yahoo/vespa/config/server/http/v2/DeploymentMetricsResponse.java index 8244a486f1c..062a21b1f80 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/http/v2/DeploymentMetricsResponse.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/http/v2/DeploymentMetricsResponse.java @@ -32,7 +32,14 @@ public class DeploymentMetricsResponse extends SlimeJsonResponse { aggregator.aggregateDocumentCount().ifPresent(documentCount -> metrics.setDouble("documentCount", documentCount)); aggregator.aggregateQueryLatency().ifPresent(queryLatency -> metrics.setDouble("queryLatency",queryLatency)); aggregator.aggregateFeedLatency().ifPresent(feedLatency -> metrics.setDouble("feedLatency", feedLatency)); - aggregator.feedingBlocked().ifPresent(feedingBlocked -> metrics.setDouble("feedingBlocked", feedingBlocked)); + aggregator.memoryUsage().ifPresent(memory -> { + metrics.setDouble("memoryUtil", memory.util()); + metrics.setDouble("memoryFeedBlockLimit", memory.feedBlockLimit()); + }); + aggregator.diskUsage().ifPresent(disk -> { + metrics.setDouble("diskUtil", disk.util()); + metrics.setDouble("diskFeedBlockLimit", disk.feedBlockLimit()); + }); } } } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java index 2903f0fadcc..0c8b33b9002 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetriever.java @@ -46,8 +46,8 @@ public class ClusterDeploymentMetricsRetriever { private static final String VESPA_CONTAINER = "vespa.container"; private static final String VESPA_QRSERVER = "vespa.qrserver"; private static final String VESPA_DISTRIBUTOR = "vespa.distributor"; - private static final String VESPA_SEARCHNODE = "vespa.searchnode"; - private static final List<String> WANTED_METRIC_SERVICES = List.of(VESPA_CONTAINER, VESPA_QRSERVER, VESPA_DISTRIBUTOR, VESPA_SEARCHNODE); + private static final String VESPA_CONTAINER_CLUSTERCONTROLLER = "vespa.container-clustercontroller"; + private static final List<String> WANTED_METRIC_SERVICES = List.of(VESPA_CONTAINER, VESPA_QRSERVER, VESPA_DISTRIBUTOR, VESPA_CONTAINER_CLUSTERCONTROLLER); private static final ExecutorService executor = Executors.newFixedThreadPool(10, new DaemonThreadFactory("cluster-deployment-metrics-retriever-")); @@ -138,8 +138,12 @@ public class ClusterDeploymentMetricsRetriever { case VESPA_DISTRIBUTOR: deploymentMetricsAggregator.addDocumentCount(values.field("vds.distributor.docsstored.average").asDouble()); break; - case VESPA_SEARCHNODE: - deploymentMetricsAggregator.addFeedingBlocked((int) values.field("content.proton.resource_usage.feeding_blocked.last").asLong()); + case VESPA_CONTAINER_CLUSTERCONTROLLER: + deploymentMetricsAggregator + .addMemoryUsage(values.field("cluster-controller.resource_usage.max_memory_utilization.last").asDouble(), + values.field("cluster-controller.resource_usage.memory_limit.last").asDouble()) + .addDiskUsage(values.field("cluster-controller.resource_usage.max_disk_utilization.last").asDouble(), + values.field("cluster-controller.resource_usage.disk_limit.last").asDouble()); break; } } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/DeploymentMetricsAggregator.java b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/DeploymentMetricsAggregator.java index 916f5ff5613..f27cf942dd8 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/DeploymentMetricsAggregator.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/DeploymentMetricsAggregator.java @@ -13,7 +13,8 @@ public class DeploymentMetricsAggregator { private LatencyMetrics qr; private LatencyMetrics container; private Double documentCount; - private Integer feedingBlocked; + private ResourceUsage memoryUsage; + private ResourceUsage diskUsage; public synchronized DeploymentMetricsAggregator addFeedLatency(double sum, double count) { this.feed = combineLatency(this.feed, sum, count); @@ -35,50 +36,87 @@ public class DeploymentMetricsAggregator { return this; } - public synchronized DeploymentMetricsAggregator addFeedingBlocked(int feedingBlocked) { - this.feedingBlocked = Math.max(Optional.ofNullable(this.feedingBlocked).orElse(0), feedingBlocked); + public synchronized DeploymentMetricsAggregator addDiskUsage(double feedBlockUtil, double feedBlockLimit) { + this.diskUsage = combineResourceUtil(this.diskUsage, feedBlockUtil, feedBlockLimit); + return this; + } + + public synchronized DeploymentMetricsAggregator addMemoryUsage(double feedBlockUtil, double feedBlockLimit) { + this.memoryUsage = combineResourceUtil(this.memoryUsage, feedBlockUtil, feedBlockLimit); return this; } public Optional<Double> aggregateFeedLatency() { - return Optional.ofNullable(feed).map(m -> m.latencySum / m.latencyCount).filter(num -> !num.isNaN()); + return Optional.ofNullable(feed).map(m -> m.sum / m.count).filter(num -> !num.isNaN()); } public Optional<Double> aggregateFeedRate() { - return Optional.ofNullable(feed).map(m -> m.latencyCount / 60); + return Optional.ofNullable(feed).map(m -> m.count / 60); } public Optional<Double> aggregateQueryLatency() { if (container == null && qr == null) return Optional.empty(); var c = Optional.ofNullable(container).orElseGet(LatencyMetrics::new); var q = Optional.ofNullable(qr).orElseGet(LatencyMetrics::new); - return Optional.of((c.latencySum + q.latencySum) / (c.latencyCount + q.latencyCount)).filter(num -> !num.isNaN()); + return Optional.of((c.sum + q.sum) / (c.count + q.count)).filter(num -> !num.isNaN()); } public Optional<Double> aggregateQueryRate() { if (container == null && qr == null) return Optional.empty(); var c = Optional.ofNullable(container).orElseGet(LatencyMetrics::new); var q = Optional.ofNullable(qr).orElseGet(LatencyMetrics::new); - return Optional.of((c.latencyCount + q.latencyCount) / 60); + return Optional.of((c.count + q.count) / 60); } public Optional<Double> aggregateDocumentCount() { return Optional.ofNullable(documentCount); } - public Optional<Integer> feedingBlocked() { - return Optional.ofNullable(feedingBlocked); + public Optional<ResourceUsage> memoryUsage() { + return Optional.ofNullable(memoryUsage); } - private LatencyMetrics combineLatency(LatencyMetrics metricsOrNull, double sum, double count) { - var metrics = Optional.ofNullable(metricsOrNull).orElseGet(LatencyMetrics::new); - metrics.latencyCount += count; - metrics.latencySum += sum; - return metrics; + public Optional<ResourceUsage> diskUsage() { + return Optional.ofNullable(diskUsage); + } + + + private static LatencyMetrics combineLatency(LatencyMetrics metricsOrNull, double sum, double count) { + return Optional.ofNullable(metricsOrNull).orElseGet(LatencyMetrics::new).combine(sum, count); + } + + private static ResourceUsage combineResourceUtil(ResourceUsage resourceUsageOrNull, double util, double limit) { + return Optional.ofNullable(resourceUsageOrNull).orElseGet(ResourceUsage::new).combine(util, limit); } private static class LatencyMetrics { - double latencySum; - double latencyCount; + private double sum; + private double count; + + private LatencyMetrics combine(double sum, double count) { + this.sum += sum; + this.count += count; + return this; + } + } + + public static class ResourceUsage { + /** + * Current resource utilization relative to feed block limit, i.e. value of >= 1 means utilization at or above + * feed block limit. + */ + private double feedBlockUtil; + + /** Resource utilization limit at which further external feed is blocked */ + private double feedBlockLimit; + + private ResourceUsage combine(double feedBlockUtil, double feedBlockLimit) { + if (feedBlockUtil > this.feedBlockUtil) this.feedBlockUtil = feedBlockUtil; + if (feedBlockLimit > this.feedBlockLimit) this.feedBlockLimit = feedBlockLimit; + return this; + } + + public double util() { return feedBlockUtil * feedBlockLimit; } + public double feedBlockLimit() { return feedBlockLimit; } } } diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetrieverTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetrieverTest.java index 5aa3e196222..a5a46b67d13 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetrieverTest.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/metrics/ClusterDeploymentMetricsRetrieverTest.java @@ -35,7 +35,7 @@ public class ClusterDeploymentMetricsRetrieverTest { @Test public void testMetricAggregation() throws IOException { - List<URI> hosts = Stream.of(1, 2, 3) + List<URI> hosts = Stream.of(1, 2, 3, 4) .map(item -> URI.create("http://localhost:" + wireMock.port() + "/" + item)) .collect(Collectors.toList()); @@ -54,13 +54,21 @@ public class ClusterDeploymentMetricsRetrieverTest { .withStatus(200) .withBody(containerMetrics()))); + stubFor(get(urlEqualTo("/4")) + .willReturn(aResponse() + .withStatus(200) + .withBody(clustercontrollerMetrics()))); + ClusterInfo expectedContentCluster = new ClusterInfo("content_cluster_id", "content"); ClusterInfo expectedContainerCluster = new ClusterInfo("container_cluster_id", "container"); Map<ClusterInfo, DeploymentMetricsAggregator> aggregatorMap = new ClusterDeploymentMetricsRetriever().requestMetricsGroupedByCluster(hosts); compareAggregators( - new DeploymentMetricsAggregator().addDocumentCount(6000.0).addFeedingBlocked(0), + new DeploymentMetricsAggregator() + .addDocumentCount(6000.0) + .addMemoryUsage(0.89074, 0.8) + .addDiskUsage(0.83517, 0.75), aggregatorMap.get(expectedContentCluster) ); @@ -84,6 +92,10 @@ public class ClusterDeploymentMetricsRetrieverTest { return Files.readString(Path.of("src/test/resources/metrics/content_metrics.json")); } + private String clustercontrollerMetrics() throws IOException { + return Files.readString(Path.of("src/test/resources/metrics/clustercontroller_metrics.json")); + } + // Same tolerance value as used internally in MetricsAggregator.isZero private static final double metricsTolerance = 0.001; @@ -95,7 +107,10 @@ public class ClusterDeploymentMetricsRetrieverTest { compareOptionals(expected.aggregateFeedRate(), actual.aggregateFeedRate(), assertDoubles); compareOptionals(expected.aggregateQueryLatency(), actual.aggregateQueryLatency(), assertDoubles); compareOptionals(expected.aggregateFeedLatency(), actual.aggregateFeedLatency(), assertDoubles); - assertEquals(expected.feedingBlocked(), actual.feedingBlocked()); + compareOptionals(expected.diskUsage(), actual.diskUsage(), (a, b) -> assertDoubles.accept(a.util(), b.util())); + compareOptionals(expected.diskUsage(), actual.diskUsage(), (a, b) -> assertDoubles.accept(a.feedBlockLimit(), b.feedBlockLimit())); + compareOptionals(expected.memoryUsage(), actual.memoryUsage(), (a, b) -> assertDoubles.accept(a.util(), b.util())); + compareOptionals(expected.memoryUsage(), actual.memoryUsage(), (a, b) -> assertDoubles.accept(a.feedBlockLimit(), b.feedBlockLimit())); } @SuppressWarnings("OptionalUsedAsFieldOrParameterType") diff --git a/configserver/src/test/resources/metrics/clustercontroller_metrics.json b/configserver/src/test/resources/metrics/clustercontroller_metrics.json new file mode 100644 index 00000000000..f487f95802c --- /dev/null +++ b/configserver/src/test/resources/metrics/clustercontroller_metrics.json @@ -0,0 +1,24 @@ +{ + "services": [ + { + "name": "vespa.container-clustercontroller", + "timestamp": 1619529109, + "metrics": [ + { + "values": { + "cluster-controller.resource_usage.disk_limit.last": 0.75, + "cluster-controller.resource_usage.nodes_above_limit.last": 0, + "cluster-controller.resource_usage.max_memory_utilization.last": 0.8907474348626, + "cluster-controller.resource_usage.max_disk_utilization.last": 0.8351705494609, + "cluster-controller.cluster-state-change.count": 2, + "cluster-controller.resource_usage.memory_limit.last": 0.8 + }, + "dimensions": { + "clustertype": "content", + "clusterid": "content_cluster_id" + } + } + ] + } + ] +} |