summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java7
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java65
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java6
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java3
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java34
6 files changed, 72 insertions, 45 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java
index 0e11bcdccaf..ee74aca0e14 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/application/v4/model/ClusterMetrics.java
@@ -23,11 +23,13 @@ public class ClusterMetrics {
private final String clusterId;
private final String clusterType;
private final Map<String, Double> metrics;
+ private final Map<String, Double> reindexingProgress;
- public ClusterMetrics(String clusterId, String clusterType, Map<String, Double> metrics) {
+ public ClusterMetrics(String clusterId, String clusterType, Map<String, Double> metrics, Map<String, Double> reindexingProgress) {
this.clusterId = clusterId;
this.clusterType = clusterType;
this.metrics = Map.copyOf(metrics);
+ this.reindexingProgress = Map.copyOf(reindexingProgress);
}
public String getClusterId() {
@@ -74,4 +76,7 @@ public class ClusterMetrics {
return Optional.ofNullable(metrics.get(DISK_FEED_BLOCK_LIMIT));
}
+ public Map<String, Double> reindexingProgress() {
+ return reindexingProgress;
+ }
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index 20154c4f122..ba4aaf92fc8 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -69,7 +69,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer {
lockedInstance -> lockedInstance.with(existingDeployment.zone(), newMetrics)
.recordActivityAt(now, existingDeployment.zone())));
- controller().notificationsDb().setDeploymentFeedingBlockedNotifications(deploymentId, clusterMetrics);
+ controller().notificationsDb().setDeploymentMetricsNotifications(deploymentId, clusterMetrics);
});
} catch (Exception e) {
failures.incrementAndGet();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java
index beb771b1aa2..7c2d990750c 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDb.java
@@ -124,31 +124,22 @@ public class NotificationsDb {
}
/**
- * Updates feeding blocked notifications for the given deployment based on current cluster metrics.
- * Will clear notifications of any cluster not reporting the metrics or whose metrics indicate feed is not blocked,
- * while setting notifications for cluster that are (Level.error) or are nearly (Level.warning) feed blocked.
+ * Updates notifications based on deployment metrics (e.g. feed blocked and reindexing progress) for the given
+ * deployment based on current cluster metrics.
+ * Will clear notifications of any cluster not reporting the metrics or whose metrics indicate feed is not blocked
+ * or reindexing no longer in progress. Will set notification for clusters:
+ * - that are (Level.error) or are nearly (Level.warning) feed blocked,
+ * - that are (Level.info) currently reindexing at least 1 document type.
*/
- public void setDeploymentFeedingBlockedNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) {
+ public void setDeploymentMetricsNotifications(DeploymentId deploymentId, List<ClusterMetrics> clusterMetrics) {
Instant now = clock.instant();
- List<Notification> feedBlockNotifications = clusterMetrics.stream()
+ List<Notification> newNotifications = clusterMetrics.stream()
.flatMap(metric -> {
- Optional<Pair<Level, String>> memoryStatus =
- resourceUtilToFeedBlockStatus("memory", metric.memoryUtil(), metric.memoryFeedBlockLimit());
- Optional<Pair<Level, String>> diskStatus =
- resourceUtilToFeedBlockStatus("disk", metric.diskUtil(), metric.diskFeedBlockLimit());
- if (memoryStatus.isEmpty() && diskStatus.isEmpty()) return Stream.empty();
-
- // Find the max among levels
- Level level = Stream.of(memoryStatus, diskStatus)
- .flatMap(status -> status.stream().map(Pair::getFirst))
- .max(Comparator.comparing(Enum::ordinal)).get();
- List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream())
- .filter(status -> status.getFirst() == level) // Do not mix message from different levels
- .map(Pair::getSecond)
- .collect(Collectors.toUnmodifiableList());
NotificationSource source = NotificationSource.from(deploymentId, ClusterSpec.Id.from(metric.getClusterId()));
- return Stream.of(new Notification(now, Type.feedBlock, level, source, messages));
+ return Stream.of(createFeedBlockNotification(source, now, metric),
+ createReindexNotification(source, now, metric));
})
+ .flatMap(Optional::stream)
.collect(Collectors.toUnmodifiableList());
NotificationSource deploymentSource = NotificationSource.from(deploymentId);
@@ -157,10 +148,11 @@ public class NotificationsDb {
List<Notification> updated = Stream.concat(
initial.stream()
.filter(notification ->
- // Filter out old feed block notifications for this deployment
- notification.type() != Type.feedBlock || !deploymentSource.contains(notification.source())),
+ // Filter out old feed block notifications and reindex for this deployment
+ (notification.type() != Type.feedBlock && notification.type() != Type.reindex) ||
+ !deploymentSource.contains(notification.source())),
// ... and add the new notifications for this deployment
- feedBlockNotifications.stream())
+ newNotifications.stream())
.collect(Collectors.toUnmodifiableList());
if (!initial.equals(updated))
@@ -168,6 +160,33 @@ public class NotificationsDb {
}
}
+ private static Optional<Notification> createFeedBlockNotification(NotificationSource source, Instant at, ClusterMetrics metric) {
+ Optional<Pair<Level, String>> memoryStatus =
+ resourceUtilToFeedBlockStatus("memory", metric.memoryUtil(), metric.memoryFeedBlockLimit());
+ Optional<Pair<Level, String>> diskStatus =
+ resourceUtilToFeedBlockStatus("disk", metric.diskUtil(), metric.diskFeedBlockLimit());
+ if (memoryStatus.isEmpty() && diskStatus.isEmpty()) return Optional.empty();
+
+ // Find the max among levels
+ Level level = Stream.of(memoryStatus, diskStatus)
+ .flatMap(status -> status.stream().map(Pair::getFirst))
+ .max(Comparator.comparing(Enum::ordinal)).get();
+ List<String> messages = Stream.concat(memoryStatus.stream(), diskStatus.stream())
+ .filter(status -> status.getFirst() == level) // Do not mix message from different levels
+ .map(Pair::getSecond)
+ .collect(Collectors.toUnmodifiableList());
+ return Optional.of(new Notification(at, Type.feedBlock, level, source, messages));
+ }
+
+ private static Optional<Notification> createReindexNotification(NotificationSource source, Instant at, ClusterMetrics metric) {
+ if (metric.reindexingProgress().isEmpty()) return Optional.empty();
+ List<String> messages = metric.reindexingProgress().entrySet().stream()
+ .map(entry -> String.format("document type '%s' (%.1f%% done)", entry.getKey(), 100 * entry.getValue()))
+ .sorted()
+ .collect(Collectors.toUnmodifiableList());
+ return Optional.of(new Notification(at, Type.reindex, Level.info, source, messages));
+ }
+
/**
* Returns a feed block summary for the given resource: the notification level and
* notification message for the given resource utilization wrt. given resource limit.
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java
index 59fb5b596f1..c45aaa563e1 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainerTest.java
@@ -118,8 +118,8 @@ public class DeploymentMetricsMaintainerTest {
@Test
public void cluster_metric_aggregation_test() {
List<ClusterMetrics> clusterMetrics = List.of(
- new ClusterMetrics("niceCluster", "container", Map.of("queriesPerSecond", 23.0, "queryLatency", 1337.0)),
- new ClusterMetrics("alsoNiceCluster", "container", Map.of("queriesPerSecond", 11.0, "queryLatency", 12.0)));
+ new ClusterMetrics("niceCluster", "container", Map.of("queriesPerSecond", 23.0, "queryLatency", 1337.0), Map.of()),
+ new ClusterMetrics("alsoNiceCluster", "container", Map.of("queriesPerSecond", 11.0, "queryLatency", 12.0), Map.of()));
DeploymentMetrics deploymentMetrics = DeploymentMetricsMaintainer.updateDeploymentMetrics(DeploymentMetrics.none, clusterMetrics);
@@ -131,7 +131,7 @@ public class DeploymentMetricsMaintainerTest {
}
private void setMetrics(ApplicationId application, Map<String, Double> metrics) {
- var clusterMetrics = new ClusterMetrics("default", "container", metrics);
+ var clusterMetrics = new ClusterMetrics("default", "container", metrics, Map.of());
tester.controllerTester().serviceRegistry().configServerMock().setMetrics(new DeploymentId(application, ZoneId.from("dev", "us-east-1")), clusterMetrics);
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java
index 7b4882de3ff..29d77c38b1a 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java
@@ -16,7 +16,6 @@ import java.time.Duration;
import java.util.Map;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
/**
* Tests the traffic fraction updater. This also tests its dependency on DeploymentMetricsMaintainer.
@@ -82,7 +81,7 @@ public class TrafficShareUpdaterTest {
}
private void setQpsMetric(double qps, ApplicationId application, ZoneId zone, DeploymentTester tester) {
- var clusterMetrics = new ClusterMetrics("default", "container", Map.of(ClusterMetrics.QUERIES_PER_SECOND, qps));
+ var clusterMetrics = new ClusterMetrics("default", "container", Map.of(ClusterMetrics.QUERIES_PER_SECOND, qps), Map.of());
tester.controllerTester().serviceRegistry().configServerMock().setMetrics(new DeploymentId(application, zone), clusterMetrics);
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java
index f6c17ffcef1..454a4f81524 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/notification/NotificationsDbTest.java
@@ -107,54 +107,56 @@ public class NotificationsDbTest {
List<Notification> expected = new ArrayList<>(notifications);
// No metrics, no new notification
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of());
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of());
assertEquals(expected, curatorDb.readNotifications(tenant));
// Metrics that contain none of the feed block metrics does not create new notification
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null)));
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", null, null, null, null, Map.of())));
assertEquals(expected, curatorDb.readNotifications(tenant));
// Metrics that only contain util or limit (should not be possible) should not cause any issues
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5)));
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, null, null, 0.5, Map.of())));
assertEquals(expected, curatorDb.readNotifications(tenant));
// One resource is at warning
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5)));
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5, Map.of())));
expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)"));
assertEquals(expected, curatorDb.readNotifications(tenant));
// Both resources over the limit
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5)));
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.3, 0.5, Map.of())));
expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1, "disk (usage: 95.0%, feed block limit: 90.0%)"));
assertEquals(expected, curatorDb.readNotifications(tenant));
// One resource at warning, one at error: Only show error message
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5)));
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(clusterMetrics("cluster1", 0.95, 0.9, 0.7, 0.5, Map.of())));
expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster1,
"memory (usage: 70.0%, feed block limit: 50.0%)", "disk (usage: 95.0%, feed block limit: 90.0%)"));
assertEquals(expected, curatorDb.readNotifications(tenant));
}
@Test
- public void feed_blocked_multiple_cluster_test() {
+ public void deployment_metrics_multiple_cluster_test() {
DeploymentId deploymentId = new DeploymentId(ApplicationId.from(tenant.value(), "app1", "instance1"), ZoneId.from("prod", "us-south-3"));
NotificationSource sourceCluster1 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster1"));
NotificationSource sourceCluster2 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster2"));
NotificationSource sourceCluster3 = NotificationSource.from(deploymentId, ClusterSpec.Id.from("cluster3"));
List<Notification> expected = new ArrayList<>(notifications);
- // Cluster1 and cluster2 are having issues
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(
- clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9)));
+ // Cluster1 and cluster2 are having feed block issues, cluster 3 is reindexing
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(
+ clusterMetrics("cluster1", 0.85, 0.9, 0.3, 0.5, Map.of()), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75, Map.of()), clusterMetrics("cluster3", 0.1, 0.8, 0.2, 0.9, Map.of("announcements", 0.75, "build", 0.5))));
expected.add(notification(12345, Type.feedBlock, Level.warning, sourceCluster1, "disk (usage: 85.0%, feed block limit: 90.0%)"));
expected.add(notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)"));
+ expected.add(notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' (75.0% done)", "document type 'build' (50.0% done)"));
assertEquals(expected, curatorDb.readNotifications(tenant));
- // Cluster1 improves, while cluster3 starts having issues
- notificationsDb.setDeploymentFeedingBlockedNotifications(deploymentId, List.of(
- clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75), clusterMetrics("cluster3", 0.75, 0.8, 0.2, 0.9)));
+ // Cluster1 improves, while cluster3 starts having feed block issues and finishes reindexing 'build' documents
+ notificationsDb.setDeploymentMetricsNotifications(deploymentId, List.of(
+ clusterMetrics("cluster1", 0.15, 0.9, 0.3, 0.5, Map.of()), clusterMetrics("cluster2", 0.6, 0.8, 0.9, 0.75, Map.of()), clusterMetrics("cluster3", 0.75, 0.8, 0.2, 0.9, Map.of("announcements", 0.9))));
expected.set(6, notification(12345, Type.feedBlock, Level.error, sourceCluster2, "memory (usage: 90.0%, feed block limit: 75.0%)"));
expected.set(7, notification(12345, Type.feedBlock, Level.warning, sourceCluster3, "disk (usage: 75.0%, feed block limit: 80.0%)"));
+ expected.set(8, notification(12345, Type.reindex, Level.info, sourceCluster3, "document type 'announcements' (90.0% done)"));
assertEquals(expected, curatorDb.readNotifications(tenant));
}
@@ -206,12 +208,14 @@ public class NotificationsDbTest {
return new Notification(Instant.ofEpochSecond(secondsSinceEpoch), type, level, source, List.of(messages));
}
- private static ClusterMetrics clusterMetrics(String clusterId, Double diskUtil, Double diskLimit, Double memoryUtil, Double memoryLimit) {
+ private static ClusterMetrics clusterMetrics(String clusterId,
+ Double diskUtil, Double diskLimit, Double memoryUtil, Double memoryLimit,
+ Map<String, Double> reindexingProgress) {
Map<String, Double> metrics = new HashMap<>();
if (diskUtil != null) metrics.put(ClusterMetrics.DISK_UTIL, diskUtil);
if (diskLimit != null) metrics.put(ClusterMetrics.DISK_FEED_BLOCK_LIMIT, diskLimit);
if (memoryUtil != null) metrics.put(ClusterMetrics.MEMORY_UTIL, memoryUtil);
if (memoryLimit != null) metrics.put(ClusterMetrics.MEMORY_FEED_BLOCK_LIMIT, memoryLimit);
- return new ClusterMetrics(clusterId, "content", metrics);
+ return new ClusterMetrics(clusterId, "content", metrics, reindexingProgress);
}
}