diff options
6 files changed, 72 insertions, 45 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java index 0e11bcdccaf..ee74aca0e14 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java @@ -23,11 +23,13 @@ public class ClusterMetrics { private final String clusterId; private final String clusterType; private final Map<String, Double> metrics; + private final Map<String, Double> reindexingProgress; - public ClusterMetrics(String clusterId, String clusterType, Map<String, Double> metrics) { + public ClusterMetrics(String clusterId, String clusterType, Map<String, Double> metrics, Map<String, Double> reindexingProgress) { this.clusterId = clusterId; this.clusterType = clusterType; this.metrics = Map.copyOf(metrics); + this.reindexingProgress = Map.copyOf(reindexingProgress); } public String getClusterId() { @@ -74,4 +76,7 @@ public class ClusterMetrics { return Optional.ofNullable(metrics.get(DISK_FEED_BLOCK_LIMIT)); } + public Map<String, Double> reindexingProgress() { + return reindexingProgress; + } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java index 20154c4f122..ba4aaf92fc8 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java @@ -69,7 +69,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { lockedInstance -> lockedInstance.with(existingDeployment.zone(), newMetrics) .recordActivityAt(now, existingDeployment.zone()))); - controller().notificationsDb().setDeploymentFeedingBlockedNotifications(deploymentId, clusterMetrics); + controller().notificationsDb().setDeploymentMetricsNotifications(deploymentId, clusterMetrics); }); } catch (Exception e) { failures.incrementAndGet(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java index beb771b1aa2..7c2d990750c 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java @@ -124,31 +124,22 @@ public class NotificationsDb { } /** - * Updates feeding blocked notifications for the given deployment based on current cluster metrics. - * Will clear notifications of any cluster not reporting the metrics or whose metrics indicate feed is not blocked, - * while setting notifications for cluster that are (Level.error) or are nearly (Level.warning) feed blocked. + * Updates notifications based on deployment metrics (e.g. feed blocked and reindexing progress) for the given + * deployment based on current cluster metrics. + * Will clear notifications of any cluster not reporting the metrics or whose metrics indicate feed is not blocked + * or reindexing no longer in progress. Will set notification for clusters: + * - that are (Level.error) or are nearly (Level.warning) feed blocked, + * - that are (Level.info) currently reindexing at least 1 document type. */ - public void setDeploymentFeedingBlockedNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) { + public void setDeploymentMetricsNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) { Instant now = clock.instant(); - List<Notification> feedBlockNotifications = clusterMetrics.stream() + List<Notification> newNotifications = clusterMetrics.stream() .flatMap(metric -> { - Optional<Pair<Level, String>> memoryStatus = - resourceUtilToFeedBlockStatus("memory", metric.memoryUtil(), metric.memoryFeedBlockLimit()); - Optional<Pair<Level, String>> diskStatus = - resourceUtilToFeedBlockStatus("disk", metric.diskUtil(), metric.diskFeedBlockLimit()); - if (memoryStatus.isEmpty() && diskStatus.isEmpty()) return Stream.empty(); - - // Find the max among levels - Level level = Stream.of(memoryStatus, diskStatus) - .flatMap(status -> status.stream().map(Pair::getFirst)) - .max(Comparator.comparing(Enum::ordinal)).get(); - List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream()) - .filter(status -> status.getFirst() == level) // Do not mix message from different levels - .map(Pair::getSecond) - .collect(Collectors.toUnmodifiableList()); NotificationSource source = NotificationSource.from(deploymentId, ClusterSpec.Id.from(metric.getClusterId())); - return Stream.of(new Notification(now, Type.feedBlock, level, source, messages)); + return Stream.of(createFeedBlockNotification(source, now, metric), + createReindexNotification(source, now, metric)); }) + .flatMap(Optional::stream) .collect(Collectors.toUnmodifiableList()); NotificationSource deploymentSource = NotificationSource.from(deploymentId); @@ -157,10 +148,11 @@ public class NotificationsDb { List<Notification> updated = Stream.concat( initial.stream() .filter(notification -> - // Filter out old feed block notifications for this deployment - notification.type() != Type.feedBlock || !deploymentSource.contains(notification.source())), + // Filter out old feed block notifications and reindex for this deployment + (notification.type() != Type.feedBlock && notification.type() != Type.reindex) || + !deploymentSource.contains(notification.source())), // ... and add the new notifications for this deployment - feedBlockNotifications.stream()) + newNotifications.stream()) .collect(Collectors.toUnmodifiableList()); if (!initial.equals(updated)) @@ -168,6 +160,33 @@ public class NotificationsDb { } } + private static Optional<Notification> createFeedBlockNotification(NotificationSource source, Instant at, ClusterMetrics metric) { + Optional<Pair<Level, String>> memoryStatus = + resourceUtilToFeedBlockStatus("memory", metric.memoryUtil(), metric.memoryFeedBlockLimit()); + Optional<Pair<Level, String>> diskStatus = + resourceUtilToFeedBlockStatus("disk", metric.diskUtil(), metric.diskFeedBlockLimit()); + if (memoryStatus.isEmpty() && diskStatus.isEmpty()) return Optional.empty(); + + // Find the max among levels + Level level = Stream.of(memoryStatus, diskStatus) + .flatMap(status -> status.stream().map(Pair::getFirst)) + .max(Comparator.comparing(Enum::ordinal)).get(); + List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream()) + .filter(status -> status.getFirst() == level) // Do not mix message from different levels + .map(Pair::getSecond) + .collect(Collectors.toUnmodifiableList()); + return Optional.of(new Notification(at, Type.feedBlock, level, source, messages)); + } + + private static Optional<Notification> createReindexNotification(NotificationSource source, Instant at, ClusterMetrics metric) { + if (metric.reindexingProgress().isEmpty()) return Optional.empty(); + List<String> messages = metric.reindexingProgress().entrySet().stream() + .map(entry -> String.format("document type '%s' (%.1f%% done)", entry.getKey(), 100 * entry.getValue())) + .sorted() + .collect(Collectors.toUnmodifiableList()); + return Optional.of(new Notification(at, Type.reindex, Level.info, source, messages)); + } + /** * Returns a feed block summary for the given resource: the notification level and * notification message for the given resource utilization wrt. given resource limit. diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java index 59fb5b596f1..c45aaa563e1 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java @@ -118,8 +118,8 @@ public class DeploymentMetricsMaintainerTest { @Test public void cluster_metric_aggregation_test() { List<ClusterMetrics> clusterMetrics = List.of( - new ClusterMetrics("niceCluster", "container", Map.of("queriesPerSecond", 23.0, "queryLatency", 1337.0)), - new ClusterMetrics("alsoNiceCluster", "container", Map.of("queriesPerSecond", 11.0, "queryLatency", 12.0))); + new ClusterMetrics("niceCluster", "container", Map.of("queriesPerSecond", 23.0, "queryLatency", 1337.0), Map.of()), + new ClusterMetrics("alsoNiceCluster", "container", Map.of("queriesPerSecond", 11.0, "queryLatency", 12.0), Map.of())); DeploymentMetrics deploymentMetrics = DeploymentMetricsMaintainer.updateDeploymentMetrics(DeploymentMetrics.none, clusterMetrics); @@ -131,7 +131,7 @@ public class DeploymentMetricsMaintainerTest { } private void setMetrics(ApplicationId application, Map<String, Double> metrics) { - var clusterMetrics = new ClusterMetrics("default", "container", metrics); + var clusterMetrics = new ClusterMetrics("default", "container", metrics, Map.of()); tester.controllerTester().serviceRegistry().configServerMock().setMetrics(new DeploymentId(application, ZoneId.from("dev", "us-east-1")), clusterMetrics); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java index 7b4882de3ff..29d77c38b1a 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java @@ -16,7 +16,6 @@ import java.time.Duration; import java.util.Map; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; /** * Tests the traffic fraction updater. This also tests its dependency on DeploymentMetricsMaintainer. @@ -82,7 +81,7 @@ public class TrafficShareUpdaterTest { } private void setQpsMetric(double qps, ApplicationId application, ZoneId zone, DeploymentTester tester) { - var clusterMetrics = new ClusterMetrics("default", "container", Map.of(ClusterMetrics.QUERIES_PER_SECOND, qps)); + var clusterMetrics = new ClusterMetrics("default", "container", Map.of(ClusterMetrics.QUERIES_PER_SECOND, qps), Map.of()); tester.controllerTester().serviceRegistry().configServerMock().setMetrics(new DeploymentId(application, zone), clusterMetrics); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java index f6c17ffcef1..454a4f81524 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java @@ -107,54 +107,56 @@ public class NotificationsDbTest { List<Notification> expected = new ArrayList<>(notifications); // No metrics, no new notification - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of()); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of()); assertEquals(expected, curatorDb.readNotifications(tenant)); // Metrics that contain none of the feed block metrics does not create new notification - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null, Map.of()))); assertEquals(expected, curatorDb.readNotifications(tenant)); // Metrics that only contain util or limit (should not be possible) should not cause any issues - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5, Map.of()))); assertEquals(expected, curatorDb.readNotifications(tenant)); // One resource is at warning - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5, Map.of()))); expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)")); assertEquals(expected, curatorDb.readNotifications(tenant)); // Both resources over the limit - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5, Map.of()))); expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "disk (usage: 95.0%, feed block limit: 90.0%)")); assertEquals(expected, curatorDb.readNotifications(tenant)); // One resource at warning, one at error: Only show error message - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5, Map.of()))); expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "memory (usage: 70.0%, feed block limit: 50.0%)", "disk (usage: 95.0%, feed block limit: 90.0%)")); assertEquals(expected, curatorDb.readNotifications(tenant)); } @Test - public void feed_blocked_multiple_cluster_test() { + public void deployment_metrics_multiple_cluster_test() { DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3")); NotificationSource sourceCluster1 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster1")); NotificationSource sourceCluster2 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster2")); NotificationSource sourceCluster3 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster3")); List<Notification> expected = new ArrayList<>(notifications); - // Cluster1 and cluster2 are having issues - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of( - clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9))); + // Cluster1 and cluster2 are having feed block issues, cluster 3 is reindexing + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of( + clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5, Map.of()), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75, Map.of()), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9, Map.of("announcements", 0.75, "build", 0.5)))); expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)")); expected.add(notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)")); + expected.add(notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' (75.0% done)", "document type 'build' (50.0% done)")); assertEquals(expected, curatorDb.readNotifications(tenant)); - // Cluster1 improves, while cluster3 starts having issues - notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of( - clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.75, 0.8, 0.2, 0.9))); + // Cluster1 improves, while cluster3 starts having feed block issues and finishes reindexing 'build' documents + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of( + clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5, Map.of()), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75, Map.of()), clusterMetrics("cluster3", 0.75, 0.8, 0.2, 0.9, Map.of("announcements", 0.9)))); expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)")); expected.set(7, notification(12345, Type.feedBlock, Level.warning, sourceCluster3, "disk (usage: 75.0%, feed block limit: 80.0%)")); + expected.set(8, notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' (90.0% done)")); assertEquals(expected, curatorDb.readNotifications(tenant)); } @@ -206,12 +208,14 @@ public class NotificationsDbTest { return new Notification(Instant.ofEpochSecond(secondsSinceEpoch), type, level, source, List.of(messages)); } - private static ClusterMetrics clusterMetrics(String clusterId, Double diskUtil, Double diskLimit, Double memoryUtil, Double memoryLimit) { + private static ClusterMetrics clusterMetrics(String clusterId, + Double diskUtil, Double diskLimit, Double memoryUtil, Double memoryLimit, + Map<String, Double> reindexingProgress) { Map<String, Double> metrics = new HashMap<>(); if (diskUtil != null) metrics.put(ClusterMetrics.DISK_UTIL, diskUtil); if (diskLimit != null) metrics.put(ClusterMetrics.DISK_FEED_BLOCK_LIMIT, diskLimit); if (memoryUtil != null) metrics.put(ClusterMetrics.MEMORY_UTIL, memoryUtil); if (memoryLimit != null) metrics.put(ClusterMetrics.MEMORY_FEED_BLOCK_LIMIT, memoryLimit); - return new ClusterMetrics(clusterId, "content", metrics); + return new ClusterMetrics(clusterId, "content", metrics, reindexingProgress); } } |