diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2021-04-29 21:07:17 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerij92@gmail.com> | 2021-04-29 21:07:37 +0200 |
commit | 906a8b24c29bcb03bfecf45606676332df26d61b (patch) | |
tree | 44af53f2030a0a3b2e4ee892ec9e763481de949e /controller-server | |
parent | e1fad8e05389ffff936667a7f10fec27aa933982 (diff) |
Add feed blocked notification
Diffstat (limited to 'controller-server')
6 files changed, 172 insertions, 17 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java index cb431158344..6bda6dc8a25 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java @@ -58,9 +58,10 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { attempts.incrementAndGet(); try { if (deployment.version().getMajor() < 7) continue; - List<ClusterMetrics> clusterMetrics = controller().serviceRegistry().configServer() - .getDeploymentMetrics(new DeploymentId(instance.id(), deployment.zone())); + DeploymentId deploymentId = new DeploymentId(instance.id(), deployment.zone()); + List<ClusterMetrics> clusterMetrics = controller().serviceRegistry().configServer().getDeploymentMetrics(deploymentId); Instant now = controller().clock().instant(); + applications.lockApplicationIfPresent(application.id(), locked -> { Deployment existingDeployment = locked.get().require(instance.name()).deployments().get(deployment.zone()); if (existingDeployment == null) return; // Deployment removed since we started collecting metrics @@ -69,6 +70,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { lockedInstance -> lockedInstance.with(existingDeployment.zone(), newMetrics) .recordActivityAt(now, existingDeployment.zone()))); + controller().notificationsDb().setDeploymentFeedingBlockedNotifications(deploymentId, clusterMetrics); }); } catch (Exception e) { failures.incrementAndGet(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java index 55a769b5960..ea0422ea9fc 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java @@ -6,6 +6,11 @@ import java.util.List; import java.util.Objects; /** + * Represents an event that we want to notify the tenant about. The message(s) should be short + * and only describe event details: the final presentation will prefix the message with general + * information from other metadata in this notification (e.g. links to relevant console views + * and/or relevant documentation. + * * @author freva */ public class Notification { @@ -56,6 +61,7 @@ public class Notification { } public enum Level { + // Must be ordered in order of importance warning, error } @@ -64,7 +70,10 @@ public class Notification { applicationPackage, /** Related to deployment of application, e.g. system test failure, out of capacity, internal errors, etc. */ - deployment + deployment, + + /** Application cluster is (near) external feed blocked */ + feedBlock; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java index f61d59552c4..9fb39640833 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java @@ -1,6 +1,8 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.notification; +import com.yahoo.collections.Pair; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.vespa.curator.Lock; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; @@ -8,9 +10,13 @@ import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; import java.time.Clock; +import java.time.Instant; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; +import java.util.Optional; import java.util.stream.Collectors; +import java.util.stream.Stream; import static com.yahoo.vespa.hosted.controller.notification.Notification.Level; import static com.yahoo.vespa.hosted.controller.notification.Notification.Type; @@ -86,4 +92,66 @@ public class NotificationsDb { curatorDb.writeNotifications(source.tenant(), filtered); } } + + /** + * Updates feeding blocked notifications for the given deployment based on current cluster metrics. + * Will clear notifications of any cluster not reporting the metrics or whose metrics indicate feed is not blocked, + * while setting notifications for cluster that are (Level.error) or are nearly (Level.warning) feed blocked. + */ + public void setDeploymentFeedingBlockedNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) { + Instant now = clock.instant(); + List<Notification> feedBlockNotifications = clusterMetrics.stream() + .flatMap(metric -> { + Optional<Pair<Level, String>> memoryStatus = + resourceUtilToFeedBlockStatus("memory", metric.memoryUtil(), metric.memoryFeedBlockLimit()); + Optional<Pair<Level, String>> diskStatus = + resourceUtilToFeedBlockStatus("disk", metric.diskUtil(), metric.diskFeedBlockLimit()); + if (memoryStatus.isEmpty() && diskStatus.isEmpty()) return Stream.empty(); + + // Find the max among levels + Level level = Stream.of(memoryStatus, diskStatus) + .flatMap(status -> status.stream().map(Pair::getFirst)) + .max(Comparator.comparing(Enum::ordinal)).get(); + List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream()) + .filter(status -> status.getFirst() == level) // Do not mix message from different levels + .map(Pair::getSecond) + .collect(Collectors.toUnmodifiableList()); + NotificationSource source = NotificationSource.from(deploymentId, ClusterSpec.Id.from(metric.getClusterId())); + return Stream.of(new Notification(now, Type.feedBlock, level, source, messages)); + }) + .collect(Collectors.toUnmodifiableList()); + + NotificationSource deploymentSource = NotificationSource.from(deploymentId); + try (Lock lock = curatorDb.lockNotifications(deploymentSource.tenant())) { + List<Notification> initial = curatorDb.readNotifications(deploymentSource.tenant()); + List<Notification> updated = Stream.concat( + initial.stream() + .filter(notification -> + // Filter out old feed block notifications for this deployment + notification.type() != Type.feedBlock || !deploymentSource.contains(notification.source())), + // ... and add the new notifications for this deployment + feedBlockNotifications.stream()) + .collect(Collectors.toUnmodifiableList()); + + if (!initial.equals(updated)) + curatorDb.writeNotifications(deploymentSource.tenant(), updated); + } + } + + /** + * Returns a feed block summary for the given resource: the notification level and + * notification message for the given resource utilization wrt. given resource limit. + * If utilization is well below the limit, Optional.empty() is returned. + */ + private static Optional<Pair<Level, String>> resourceUtilToFeedBlockStatus( + String resource, Optional<Double> util, Optional<Double> feedBlockLimit) { + if (util.isEmpty() || feedBlockLimit.isEmpty()) return Optional.empty(); + double utilRelativeToLimit = util.get() / feedBlockLimit.get(); + if (utilRelativeToLimit < 0.9) return Optional.empty(); + + String message = String.format("%s (usage: %.1f%%, feed block limit: %.1f%%)", + resource, 100 * util.get(), 100 * feedBlockLimit.get()); + if (utilRelativeToLimit < 1) return Optional.of(new Pair<>(Level.warning, message)); + return Optional.of(new Pair<>(Level.error, message)); + } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java index 1cadfd67e77..c0b88bb3b86 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java @@ -93,6 +93,7 @@ public class NotificationsSerializer { switch (type) { case applicationPackage: return "applicationPackage"; case deployment: return "deployment"; + case feedBlock: return "feedBlock"; default: throw new IllegalArgumentException("No serialization defined for notification type " + type); } } @@ -103,6 +104,7 @@ public class NotificationsSerializer { case "applicationPackage": return Notification.Type.applicationPackage; case "DEPLOYMENT_FAILURE": // TODO (valerijf): Remove after 7.398+ case "deployment": return Notification.Type.deployment; + case "feedBlock": return Notification.Type.feedBlock; default: throw new IllegalArgumentException("Unknown serialized notification type value '" + field.asString() + "'"); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java index 1a5d3a873fa..2a7132c08d6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java @@ -517,6 +517,7 @@ public class ApplicationApiHandler extends AuditLoggingRequestHandler { switch (type) { case applicationPackage: return "applicationPackage"; case deployment: return "deployment"; + case feedBlock: return "feedBlock"; default: throw new IllegalArgumentException("No serialization defined for notification type " + type); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java index c5e35296f04..a42e3d95167 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java @@ -7,6 +7,7 @@ import com.yahoo.config.provision.TenantName; import com.yahoo.config.provision.zone.ZoneId; import com.yahoo.path.Path; import com.yahoo.test.ManualClock; +import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId; @@ -18,13 +19,18 @@ import org.junit.Test; import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static com.yahoo.vespa.hosted.controller.notification.Notification.Level; +import static com.yahoo.vespa.hosted.controller.notification.Notification.Type; + /** * @author freva */ @@ -32,12 +38,12 @@ public class NotificationsDbTest { private static final TenantName tenant = TenantName.from("tenant1"); private static final List<Notification> notifications = List.of( - notification(1001, Notification.Type.deployment, NotificationSource.from(tenant), "tenant msg"), - notification(1101, Notification.Type.deployment, NotificationSource.from(TenantAndApplicationId.from(tenant.value(), "app1")), "app msg"), - notification(1201, Notification.Type.deployment, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg"), - notification(1301, Notification.Type.deployment, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app2", "instance2"), ZoneId.from("prod", "us-north-2"))), "deployment msg"), - notification(1401, Notification.Type.deployment, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("dev", "us-south-1")), ClusterSpec.Id.from("cluster1")), "cluster msg"), - notification(1501, Notification.Type.deployment, NotificationSource.from(new RunId(ApplicationId.from(tenant.value(), "app1", "instance1"), JobType.devUsEast1, 4)), "run id msg")); + notification(1001, Type.deployment, Level.error, NotificationSource.from(tenant), "tenant msg"), + notification(1101, Type.applicationPackage, Level.warning, NotificationSource.from(TenantAndApplicationId.from(tenant.value(), "app1")), "app msg"), + notification(1201, Type.deployment, Level.error, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg"), + notification(1301, Type.deployment, Level.warning, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app2", "instance2"), ZoneId.from("prod", "us-north-2"))), "deployment msg"), + notification(1401, Type.feedBlock, Level.error, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("dev", "us-south-1")), ClusterSpec.Id.from("cluster1")), "cluster msg"), + notification(1501, Type.deployment, Level.warning, NotificationSource.from(new RunId(ApplicationId.from(tenant.value(), "app1", "instance1"), JobType.devUsEast1, 4)), "run id msg")); private final ManualClock clock = new ManualClock(Instant.ofEpochSecond(12345)); private final MockCuratorDb curatorDb = new MockCuratorDb(); @@ -55,14 +61,14 @@ public class NotificationsDbTest { @Test public void add_test() { - Notification notification1 = notification(12345, Notification.Type.deployment, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg #2"); - Notification notification2 = notification(12345, Notification.Type.deployment, NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), "instance msg #3"); + Notification notification1 = notification(12345, Type.deployment, Level.warning, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg #2"); + Notification notification2 = notification(12345, Type.deployment, Level.error, NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), "instance msg #3"); // Replace the 3rd notification - notificationsDb.setNotification(notification1.source(), notification1.type(), Notification.Level.warning, notification1.messages()); + notificationsDb.setNotification(notification1.source(), notification1.type(), notification1.level(), notification1.messages()); // Notification for a new app, add without replacement - notificationsDb.setNotification(notification2.source(), notification2.type(), Notification.Level.warning, notification2.messages()); + notificationsDb.setNotification(notification2.source(), notification2.type(), notification2.level(), notification2.messages()); List<Notification> expected = notificationIndices(0, 1, 3, 4, 5); expected.addAll(List.of(notification1, notification2)); @@ -72,10 +78,10 @@ public class NotificationsDbTest { @Test public void remove_single_test() { // Remove the 3rd notification - notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), Notification.Type.deployment); + notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), Type.deployment); // Removing something that doesn't exist is OK - notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), Notification.Type.deployment); + notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), Type.deployment); assertEquals(notificationIndices(0, 1, 3, 4, 5), curatorDb.readNotifications(tenant)); } @@ -92,6 +98,64 @@ public class NotificationsDbTest { assertFalse(curatorDb.curator().exists(Path.fromString("/controller/v1/notifications/" + tenant.value()))); } + @Test + public void feed_blocked_single_cluster_test() { + DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3")); + NotificationSource sourceCluster1 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster1")); + List<Notification> expected = new ArrayList<>(notifications); + + // No metrics, no new notification + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of()); + assertEquals(expected, curatorDb.readNotifications(tenant)); + + // Metrics that contain none of the feed block metrics does not create new notification + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null))); + assertEquals(expected, curatorDb.readNotifications(tenant)); + + // Metrics that only contain util or limit (should not be possible) should not cause any issues + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5))); + assertEquals(expected, curatorDb.readNotifications(tenant)); + + // One resource is at warning + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5))); + expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)")); + assertEquals(expected, curatorDb.readNotifications(tenant)); + + // Both resources over the limit + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5))); + expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "disk (usage: 95.0%, feed block limit: 90.0%)")); + assertEquals(expected, curatorDb.readNotifications(tenant)); + + // One resource at warning, one at error: Only show error message + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5))); + expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, + "memory (usage: 70.0%, feed block limit: 50.0%)", "disk (usage: 95.0%, feed block limit: 90.0%)")); + assertEquals(expected, curatorDb.readNotifications(tenant)); + } + + @Test + public void feed_blocked_multiple_cluster_test() { + DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3")); + NotificationSource sourceCluster1 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster1")); + NotificationSource sourceCluster2 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster2")); + NotificationSource sourceCluster3 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster3")); + List<Notification> expected = new ArrayList<>(notifications); + + // Cluster1 and cluster2 are having issues + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of( + clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9))); + expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)")); + expected.add(notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)")); + assertEquals(expected, curatorDb.readNotifications(tenant)); + + // Cluster1 improves, while cluster3 starts having issues + notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of( + clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.75, 0.8, 0.2, 0.9))); + expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)")); + expected.set(7, notification(12345, Type.feedBlock, Level.warning, sourceCluster3, "disk (usage: 75.0%, feed block limit: 80.0%)")); + assertEquals(expected, curatorDb.readNotifications(tenant)); + } + @Before public void init() { curatorDb.writeNotifications(tenant, notifications); @@ -101,7 +165,16 @@ public class NotificationsDbTest { return Arrays.stream(indices).mapToObj(notifications::get).collect(Collectors.toCollection(ArrayList::new)); } - private static Notification notification(long secondsSinceEpoch, Notification.Type type, NotificationSource source, String... messages) { - return new Notification(Instant.ofEpochSecond(secondsSinceEpoch), type, Notification.Level.warning, source, List.of(messages)); + private static Notification notification(long secondsSinceEpoch, Type type, Level level, NotificationSource source, String... messages) { + return new Notification(Instant.ofEpochSecond(secondsSinceEpoch), type, level, source, List.of(messages)); + } + + private static ClusterMetrics clusterMetrics(String clusterId, Double diskUtil, Double diskLimit, Double memoryUtil, Double memoryLimit) { + Map<String, Double> metrics = new HashMap<>(); + if (diskUtil != null) metrics.put(ClusterMetrics.DISK_UTIL, diskUtil); + if (diskLimit != null) metrics.put(ClusterMetrics.DISK_FEED_BLOCK_LIMIT, diskLimit); + if (memoryUtil != null) metrics.put(ClusterMetrics.MEMORY_UTIL, memoryUtil); + if (memoryLimit != null) metrics.put(ClusterMetrics.MEMORY_FEED_BLOCK_LIMIT, memoryLimit); + return new ClusterMetrics(clusterId, "content", metrics); } } |