summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorValerij Fredriksen <valerij92@gmail.com>2021-04-29 21:07:17 +0200
committerValerij Fredriksen <valerij92@gmail.com>2021-04-29 21:07:37 +0200
commit906a8b24c29bcb03bfecf45606676332df26d61b (patch)
tree44af53f2030a0a3b2e4ee892ec9e763481de949e /controller-server
parente1fad8e05389ffff936667a7f10fec27aa933982 (diff)
Add feed blocked notification
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java11
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java68
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java1
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java101
6 files changed, 172 insertions, 17 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index cb431158344..6bda6dc8a25 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -58,9 +58,10 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer {
attempts.incrementAndGet();
try {
if (deployment.version().getMajor() < 7) continue;
- List<ClusterMetrics> clusterMetrics = controller().serviceRegistry().configServer()
- .getDeploymentMetrics(new DeploymentId(instance.id(), deployment.zone()));
+ DeploymentId deploymentId = new DeploymentId(instance.id(), deployment.zone());
+ List<ClusterMetrics> clusterMetrics = controller().serviceRegistry().configServer().getDeploymentMetrics(deploymentId);
Instant now = controller().clock().instant();
+
applications.lockApplicationIfPresent(application.id(), locked -> {
Deployment existingDeployment = locked.get().require(instance.name()).deployments().get(deployment.zone());
if (existingDeployment == null) return; // Deployment removed since we started collecting metrics
@@ -69,6 +70,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer {
lockedInstance -> lockedInstance.with(existingDeployment.zone(), newMetrics)
.recordActivityAt(now, existingDeployment.zone())));
+ controller().notificationsDb().setDeploymentFeedingBlockedNotifications(deploymentId, clusterMetrics);
});
} catch (Exception e) {
failures.incrementAndGet();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java
index 55a769b5960..ea0422ea9fc 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/Notification.java
@@ -6,6 +6,11 @@ import java.util.List;
import java.util.Objects;
/**
+ * Represents an event that we want to notify the tenant about. The message(s) should be short
+ * and only describe event details: the final presentation will prefix the message with general
+ * information from other metadata in this notification (e.g. links to relevant console views
+ * and/or relevant documentation.
+ *
* @author freva
*/
public class Notification {
@@ -56,6 +61,7 @@ public class Notification {
}
public enum Level {
+ // Must be ordered in order of importance
warning, error
}
@@ -64,7 +70,10 @@ public class Notification {
applicationPackage,
/** Related to deployment of application, e.g. system test failure, out of capacity, internal errors, etc. */
- deployment
+ deployment,
+
+ /** Application cluster is (near) external feed blocked */
+ feedBlock;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java
index f61d59552c4..9fb39640833 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java
@@ -1,6 +1,8 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.notification;
+import com.yahoo.collections.Pair;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics;
@@ -8,9 +10,13 @@ import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;
import java.time.Clock;
+import java.time.Instant;
import java.util.ArrayList;
+import java.util.Comparator;
import java.util.List;
+import java.util.Optional;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import static com.yahoo.vespa.hosted.controller.notification.Notification.Level;
import static com.yahoo.vespa.hosted.controller.notification.Notification.Type;
@@ -86,4 +92,66 @@ public class NotificationsDb {
curatorDb.writeNotifications(source.tenant(), filtered);
}
}
+
+ /**
+ * Updates feeding blocked notifications for the given deployment based on current cluster metrics.
+ * Will clear notifications of any cluster not reporting the metrics or whose metrics indicate feed is not blocked,
+ * while setting notifications for cluster that are (Level.error) or are nearly (Level.warning) feed blocked.
+ */
+ public void setDeploymentFeedingBlockedNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) {
+ Instant now = clock.instant();
+ List<Notification> feedBlockNotifications = clusterMetrics.stream()
+ .flatMap(metric -> {
+ Optional<Pair<Level, String>> memoryStatus =
+ resourceUtilToFeedBlockStatus("memory", metric.memoryUtil(), metric.memoryFeedBlockLimit());
+ Optional<Pair<Level, String>> diskStatus =
+ resourceUtilToFeedBlockStatus("disk", metric.diskUtil(), metric.diskFeedBlockLimit());
+ if (memoryStatus.isEmpty() && diskStatus.isEmpty()) return Stream.empty();
+
+ // Find the max among levels
+ Level level = Stream.of(memoryStatus, diskStatus)
+ .flatMap(status -> status.stream().map(Pair::getFirst))
+ .max(Comparator.comparing(Enum::ordinal)).get();
+ List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream())
+ .filter(status -> status.getFirst() == level) // Do not mix message from different levels
+ .map(Pair::getSecond)
+ .collect(Collectors.toUnmodifiableList());
+ NotificationSource source = NotificationSource.from(deploymentId, ClusterSpec.Id.from(metric.getClusterId()));
+ return Stream.of(new Notification(now, Type.feedBlock, level, source, messages));
+ })
+ .collect(Collectors.toUnmodifiableList());
+
+ NotificationSource deploymentSource = NotificationSource.from(deploymentId);
+ try (Lock lock = curatorDb.lockNotifications(deploymentSource.tenant())) {
+ List<Notification> initial = curatorDb.readNotifications(deploymentSource.tenant());
+ List<Notification> updated = Stream.concat(
+ initial.stream()
+ .filter(notification ->
+ // Filter out old feed block notifications for this deployment
+ notification.type() != Type.feedBlock || !deploymentSource.contains(notification.source())),
+ // ... and add the new notifications for this deployment
+ feedBlockNotifications.stream())
+ .collect(Collectors.toUnmodifiableList());
+
+ if (!initial.equals(updated))
+ curatorDb.writeNotifications(deploymentSource.tenant(), updated);
+ }
+ }
+
+ /**
+ * Returns a feed block summary for the given resource: the notification level and
+ * notification message for the given resource utilization wrt. given resource limit.
+ * If utilization is well below the limit, Optional.empty() is returned.
+ */
+ private static Optional<Pair<Level, String>> resourceUtilToFeedBlockStatus(
+ String resource, Optional<Double> util, Optional<Double> feedBlockLimit) {
+ if (util.isEmpty() || feedBlockLimit.isEmpty()) return Optional.empty();
+ double utilRelativeToLimit = util.get() / feedBlockLimit.get();
+ if (utilRelativeToLimit < 0.9) return Optional.empty();
+
+ String message = String.format("%s (usage: %.1f%%, feed block limit: %.1f%%)",
+ resource, 100 * util.get(), 100 * feedBlockLimit.get());
+ if (utilRelativeToLimit < 1) return Optional.of(new Pair<>(Level.warning, message));
+ return Optional.of(new Pair<>(Level.error, message));
+ }
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java
index 1cadfd67e77..c0b88bb3b86 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/NotificationsSerializer.java
@@ -93,6 +93,7 @@ public class NotificationsSerializer {
switch (type) {
case applicationPackage: return "applicationPackage";
case deployment: return "deployment";
+ case feedBlock: return "feedBlock";
default: throw new IllegalArgumentException("No serialization defined for notification type " + type);
}
}
@@ -103,6 +104,7 @@ public class NotificationsSerializer {
case "applicationPackage": return Notification.Type.applicationPackage;
case "DEPLOYMENT_FAILURE": // TODO (valerijf): Remove after 7.398+
case "deployment": return Notification.Type.deployment;
+ case "feedBlock": return Notification.Type.feedBlock;
default: throw new IllegalArgumentException("Unknown serialized notification type value '" + field.asString() + "'");
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
index 1a5d3a873fa..2a7132c08d6 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
@@ -517,6 +517,7 @@ public class ApplicationApiHandler extends AuditLoggingRequestHandler {
switch (type) {
case applicationPackage: return "applicationPackage";
case deployment: return "deployment";
+ case feedBlock: return "feedBlock";
default: throw new IllegalArgumentException("No serialization defined for notification type " + type);
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java
index c5e35296f04..a42e3d95167 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java
@@ -7,6 +7,7 @@ import com.yahoo.config.provision.TenantName;
import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.path.Path;
import com.yahoo.test.ManualClock;
+import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics;
import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId;
@@ -18,13 +19,18 @@ import org.junit.Test;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import static com.yahoo.vespa.hosted.controller.notification.Notification.Level;
+import static com.yahoo.vespa.hosted.controller.notification.Notification.Type;
+
/**
* @author freva
*/
@@ -32,12 +38,12 @@ public class NotificationsDbTest {
private static final TenantName tenant = TenantName.from("tenant1");
private static final List<Notification> notifications = List.of(
- notification(1001, Notification.Type.deployment, NotificationSource.from(tenant), "tenant msg"),
- notification(1101, Notification.Type.deployment, NotificationSource.from(TenantAndApplicationId.from(tenant.value(), "app1")), "app msg"),
- notification(1201, Notification.Type.deployment, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg"),
- notification(1301, Notification.Type.deployment, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app2", "instance2"), ZoneId.from("prod", "us-north-2"))), "deployment msg"),
- notification(1401, Notification.Type.deployment, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("dev", "us-south-1")), ClusterSpec.Id.from("cluster1")), "cluster msg"),
- notification(1501, Notification.Type.deployment, NotificationSource.from(new RunId(ApplicationId.from(tenant.value(), "app1", "instance1"), JobType.devUsEast1, 4)), "run id msg"));
+ notification(1001, Type.deployment, Level.error, NotificationSource.from(tenant), "tenant msg"),
+ notification(1101, Type.applicationPackage, Level.warning, NotificationSource.from(TenantAndApplicationId.from(tenant.value(), "app1")), "app msg"),
+ notification(1201, Type.deployment, Level.error, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg"),
+ notification(1301, Type.deployment, Level.warning, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app2", "instance2"), ZoneId.from("prod", "us-north-2"))), "deployment msg"),
+ notification(1401, Type.feedBlock, Level.error, NotificationSource.from(new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("dev", "us-south-1")), ClusterSpec.Id.from("cluster1")), "cluster msg"),
+ notification(1501, Type.deployment, Level.warning, NotificationSource.from(new RunId(ApplicationId.from(tenant.value(), "app1", "instance1"), JobType.devUsEast1, 4)), "run id msg"));
private final ManualClock clock = new ManualClock(Instant.ofEpochSecond(12345));
private final MockCuratorDb curatorDb = new MockCuratorDb();
@@ -55,14 +61,14 @@ public class NotificationsDbTest {
@Test
public void add_test() {
- Notification notification1 = notification(12345, Notification.Type.deployment, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg #2");
- Notification notification2 = notification(12345, Notification.Type.deployment, NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), "instance msg #3");
+ Notification notification1 = notification(12345, Type.deployment, Level.warning, NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), "instance msg #2");
+ Notification notification2 = notification(12345, Type.deployment, Level.error, NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), "instance msg #3");
// Replace the 3rd notification
- notificationsDb.setNotification(notification1.source(), notification1.type(), Notification.Level.warning, notification1.messages());
+ notificationsDb.setNotification(notification1.source(), notification1.type(), notification1.level(), notification1.messages());
// Notification for a new app, add without replacement
- notificationsDb.setNotification(notification2.source(), notification2.type(), Notification.Level.warning, notification2.messages());
+ notificationsDb.setNotification(notification2.source(), notification2.type(), notification2.level(), notification2.messages());
List<Notification> expected = notificationIndices(0, 1, 3, 4, 5);
expected.addAll(List.of(notification1, notification2));
@@ -72,10 +78,10 @@ public class NotificationsDbTest {
@Test
public void remove_single_test() {
// Remove the 3rd notification
- notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), Notification.Type.deployment);
+ notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app2", "instance2")), Type.deployment);
// Removing something that doesn't exist is OK
- notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), Notification.Type.deployment);
+ notificationsDb.removeNotification(NotificationSource.from(ApplicationId.from(tenant.value(), "app3", "instance2")), Type.deployment);
assertEquals(notificationIndices(0, 1, 3, 4, 5), curatorDb.readNotifications(tenant));
}
@@ -92,6 +98,64 @@ public class NotificationsDbTest {
assertFalse(curatorDb.curator().exists(Path.fromString("/controller/v1/notifications/" + tenant.value())));
}
+ @Test
+ public void feed_blocked_single_cluster_test() {
+ DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3"));
+ NotificationSource sourceCluster1 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster1"));
+ List<Notification> expected = new ArrayList<>(notifications);
+
+ // No metrics, no new notification
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of());
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+
+ // Metrics that contain none of the feed block metrics does not create new notification
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null)));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+
+ // Metrics that only contain util or limit (should not be possible) should not cause any issues
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5)));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+
+ // One resource is at warning
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5)));
+ expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)"));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+
+ // Both resources over the limit
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5)));
+ expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "disk (usage: 95.0%, feed block limit: 90.0%)"));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+
+ // One resource at warning, one at error: Only show error message
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5)));
+ expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1,
+ "memory (usage: 70.0%, feed block limit: 50.0%)", "disk (usage: 95.0%, feed block limit: 90.0%)"));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+ }
+
+ @Test
+ public void feed_blocked_multiple_cluster_test() {
+ DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3"));
+ NotificationSource sourceCluster1 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster1"));
+ NotificationSource sourceCluster2 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster2"));
+ NotificationSource sourceCluster3 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster3"));
+ List<Notification> expected = new ArrayList<>(notifications);
+
+ // Cluster1 and cluster2 are having issues
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(
+ clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9)));
+ expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)"));
+ expected.add(notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)"));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+
+ // Cluster1 improves, while cluster3 starts having issues
+ notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(
+ clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.75, 0.8, 0.2, 0.9)));
+ expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)"));
+ expected.set(7, notification(12345, Type.feedBlock, Level.warning, sourceCluster3, "disk (usage: 75.0%, feed block limit: 80.0%)"));
+ assertEquals(expected, curatorDb.readNotifications(tenant));
+ }
+
@Before
public void init() {
curatorDb.writeNotifications(tenant, notifications);
@@ -101,7 +165,16 @@ public class NotificationsDbTest {
return Arrays.stream(indices).mapToObj(notifications::get).collect(Collectors.toCollection(ArrayList::new));
}
- private static Notification notification(long secondsSinceEpoch, Notification.Type type, NotificationSource source, String... messages) {
- return new Notification(Instant.ofEpochSecond(secondsSinceEpoch), type, Notification.Level.warning, source, List.of(messages));
+ private static Notification notification(long secondsSinceEpoch, Type type, Level level, NotificationSource source, String... messages) {
+ return new Notification(Instant.ofEpochSecond(secondsSinceEpoch), type, level, source, List.of(messages));
+ }
+
+ private static ClusterMetrics clusterMetrics(String clusterId, Double diskUtil, Double diskLimit, Double memoryUtil, Double memoryLimit) {
+ Map<String, Double> metrics = new HashMap<>();
+ if (diskUtil != null) metrics.put(ClusterMetrics.DISK_UTIL, diskUtil);
+ if (diskLimit != null) metrics.put(ClusterMetrics.DISK_FEED_BLOCK_LIMIT, diskLimit);
+ if (memoryUtil != null) metrics.put(ClusterMetrics.MEMORY_UTIL, memoryUtil);
+ if (memoryLimit != null) metrics.put(ClusterMetrics.MEMORY_FEED_BLOCK_LIMIT, memoryLimit);
+ return new ClusterMetrics(clusterId, "content", metrics);
}
}