diff options
author | Valerij Fredriksen <valerijf@yahooinc.com> | 2023-01-09 10:58:29 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@yahooinc.com> | 2023-01-09 11:03:04 +0100 |
commit | 0aaf427c8f397bef63c1ae44b41ad7ec21c67e2d (patch) | |
tree | 1d5a4e2d6dd4f30c174082ec530b667554d9dc0a /controller-server | |
parent | 0b6b0a0fac5be32663a2e3b8d34633d9e6097212 (diff) |
Add cause to reindexing notification
Diffstat (limited to 'controller-server')
3 files changed, 47 insertions, 25 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java index f196ca610c8..c48f6a34441 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.ApplicationReindexing; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics; import com.yahoo.yolean.Exceptions; @@ -68,7 +69,8 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { lockedInstance -> lockedInstance.with(existingDeployment.zone(), newMetrics) .recordActivityAt(now, existingDeployment.zone()))); - controller().notificationsDb().setDeploymentMetricsNotifications(deploymentId, clusterMetrics); + ApplicationReindexing applicationReindexing = controller().serviceRegistry().configServer().getReindexing(deploymentId); + controller().notificationsDb().setDeploymentMetricsNotifications(deploymentId, clusterMetrics, applicationReindexing); }); } catch (Exception e) { failures.incrementAndGet(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java index a2960754570..cc84e012960 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java @@ -9,6 +9,7 @@ import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.ApplicationReindexing; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; import java.time.Clock; @@ -20,6 +21,7 @@ import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; +import static com.yahoo.vespa.hosted.controller.api.integration.configserver.ApplicationReindexing.Cluster; import static com.yahoo.vespa.hosted.controller.notification.Notification.Level; import static com.yahoo.vespa.hosted.controller.notification.Notification.Type; @@ -116,17 +118,18 @@ public class NotificationsDb { * - that are (Level.error) or are nearly (Level.warning) feed blocked, * - that are (Level.info) currently reindexing at least 1 document type. */ - public void setDeploymentMetricsNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) { + public void setDeploymentMetricsNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics, ApplicationReindexing applicationReindexing) { Instant now = clock.instant(); List<Notification> changed = List.of(); List<Notification> newNotifications = clusterMetrics.stream() .flatMap(metric -> { NotificationSource source = NotificationSource.from(deploymentId, ClusterSpec.Id.from(metric.getClusterId())); + Cluster cluster = applicationReindexing.clusters().getOrDefault(metric.getClusterId(), Cluster.EMPTY); return Stream.of(createFeedBlockNotification(source, now, metric), - createReindexNotification(source, now, metric)); + createReindexNotification(source, now, cluster)); }) .flatMap(Optional::stream) - .collect(Collectors.toUnmodifiableList()); + .toList(); NotificationSource deploymentSource = NotificationSource.from(deploymentId); try (Mutex lock = curatorDb.lockNotifications(deploymentSource.tenant())) { @@ -139,7 +142,7 @@ public class NotificationsDb { !deploymentSource.contains(notification.source())), // ... and add the new notifications for this deployment newNotifications.stream()) - .collect(Collectors.toUnmodifiableList()); + .toList(); if (!initial.equals(updated)) { curatorDb.writeNotifications(deploymentSource.tenant(), updated); } @@ -170,16 +173,18 @@ public class NotificationsDb { List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream()) .filter(status -> status.getFirst() == level) // Do not mix message from different levels .map(Pair::getSecond) - .collect(Collectors.toUnmodifiableList()); + .toList(); return Optional.of(new Notification(at, Type.feedBlock, level, source, messages)); } - private static Optional<Notification> createReindexNotification(NotificationSource source, Instant at, ClusterMetrics metric) { - if (metric.reindexingProgress().isEmpty()) return Optional.empty(); - List<String> messages = metric.reindexingProgress().entrySet().stream() - .map(entry -> Text.format("document type '%s' (%.1f%% done)", entry.getKey(), 100 * entry.getValue())) + private static Optional<Notification> createReindexNotification(NotificationSource source, Instant at, Cluster cluster) { + List<String> messages = cluster.ready().entrySet().stream() + .filter(entry -> entry.getValue().progress().isPresent()) + .map(entry -> Text.format("document type '%s'%s (%.1f%% done)", + entry.getKey(), entry.getValue().cause().map(s -> " " + s).orElse(""), 100 * entry.getValue().progress().get())) .sorted() - .collect(Collectors.toUnmodifiableList()); + .toList(); + if (messages.isEmpty()) return Optional.empty(); return Optional.of(new Notification(at, Type.reindex, Level.info, source, messages)); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java index fb99d86ad5c..834777abb62 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java @@ -14,6 +14,7 @@ import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.ApplicationReindexing; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId; import com.yahoo.vespa.hosted.controller.api.integration.stubs.MockMailer; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; @@ -38,6 +39,7 @@ import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import static com.yahoo.vespa.hosted.controller.api.integration.configserver.ApplicationReindexing.Cluster; import static com.yahoo.vespa.hosted.controller.notification.Notification.Level; import static com.yahoo.vespa.hosted.controller.notification.Notification.Type; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -49,6 +51,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; */ public class NotificationsDbTest { + private static final ApplicationReindexing emptyReindexing = new ApplicationReindexing(false, Map.of()); private static final TenantName tenant = TenantName.from("tenant1"); private static final Email email = new Email("user1@example.com", true); private static final CloudTenant cloudTenant = new CloudTenant(tenant, @@ -156,19 +159,19 @@ public class NotificationsDbTest { DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3")); // No metrics, no new notification - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of()); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(), emptyReindexing); assertEquals(0, mailer.inbox(email.getEmailAddress()).size()); // Metrics that contain none of the feed block metrics does not create new notification - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null)), emptyReindexing); assertEquals(0, mailer.inbox(email.getEmailAddress()).size()); // One resource is at warning - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.88, 0.9, 0.3, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.88, 0.9, 0.3, 0.5)), emptyReindexing); assertEquals(1, mailer.inbox(email.getEmailAddress()).size()); // One resource over the limit - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5)), emptyReindexing); assertEquals(2, mailer.inbox(email.getEmailAddress()).size()); } @@ -179,29 +182,29 @@ public class NotificationsDbTest { List<Notification> expected = new ArrayList<>(notifications); // No metrics, no new notification - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of()); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(), emptyReindexing); assertEquals(expected, curatorDb.readNotifications(tenant)); // Metrics that contain none of the feed block metrics does not create new notification - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null)), emptyReindexing); assertEquals(expected, curatorDb.readNotifications(tenant)); // Metrics that only contain util or limit (should not be possible) should not cause any issues - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5)), emptyReindexing); assertEquals(expected, curatorDb.readNotifications(tenant)); // One resource is at warning - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.88, 0.9, 0.3, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.88, 0.9, 0.3, 0.5)), emptyReindexing); expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 88.0%, feed block limit: 90.0%)")); assertEquals(expected, curatorDb.readNotifications(tenant)); // Both resources over the limit - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5)), emptyReindexing); expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "disk (usage: 95.0%, feed block limit: 90.0%)")); assertEquals(expected, curatorDb.readNotifications(tenant)); // One resource at warning, one at error: Only show error message - notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5))); + notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5)), emptyReindexing); expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "memory (usage: 70.0%, feed block limit: 50.0%)", "disk (usage: 95.0%, feed block limit: 90.0%)")); assertEquals(expected, curatorDb.readNotifications(tenant)); @@ -216,19 +219,27 @@ public class NotificationsDbTest { List<Notification> expected = new ArrayList<>(notifications); // Cluster1 and cluster2 are having feed block issues, cluster 3 is reindexing + ApplicationReindexing applicationReindexing1 = new ApplicationReindexing(true, Map.of( + "cluster3", new Cluster(Map.of(), Map.of( + "announcements", reindexingStatus("reindexing due to a schema change", 0.75), + "build", reindexingStatus(null, 0.50))))); notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of( - clusterMetrics("cluster1", 0.88, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9))); + clusterMetrics("cluster1", 0.88, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9)), applicationReindexing1); expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 88.0%, feed block limit: 90.0%)")); expected.add(notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)")); - expected.add(notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' (75.0% done)", "document type 'build' (50.0% done)")); + expected.add(notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' reindexing due to a schema change (75.0% done)", "document type 'build' (50.0% done)")); assertEquals(expected, curatorDb.readNotifications(tenant)); // Cluster1 improves, while cluster3 starts having feed block issues and finishes reindexing 'build' documents + ApplicationReindexing applicationReindexing2 = new ApplicationReindexing(true, Map.of( + "cluster3", new Cluster(Map.of(), Map.of( + "announcements", reindexingStatus("reindexing due to a schema change", 0.90), + "build", reindexingStatus(null, null))))); notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of( - clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.78, 0.8, 0.2, 0.9))); + clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.78, 0.8, 0.2, 0.9)), applicationReindexing2); expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)")); expected.set(7, notification(12345, Type.feedBlock, Level.warning, sourceCluster3, "disk (usage: 78.0%, feed block limit: 80.0%)")); - expected.set(8, notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' (90.0% done)")); + expected.set(8, notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' reindexing due to a schema change (90.0% done)")); assertEquals(expected, curatorDb.readNotifications(tenant)); } @@ -256,4 +267,8 @@ public class NotificationsDbTest { if (memoryLimit != null) metrics.put(ClusterMetrics.MEMORY_FEED_BLOCK_LIMIT, memoryLimit); return new ClusterMetrics(clusterId, "content", metrics); } + + private static ApplicationReindexing.Status reindexingStatus(String causeOrNull, Double progressOrNull) { + return new ApplicationReindexing.Status(null, null, null, null, null, progressOrNull, null, causeOrNull); + } } |