summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java22
-rw-r--r--clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java20
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java15
3 files changed, 15 insertions, 42 deletions
diff --git a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java
index 8a48e4ce272..a1aa5287d2f 100644
--- a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java
+++ b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java
@@ -27,31 +27,11 @@ class ReindexingMetrics {
void dump(Reindexing reindexing) {
reindexing.status().forEach((type, status) -> {
- Reindexing.State state = status.state();
metric.set("reindexing.progress",
status.progress().map(ProgressToken::percentFinished).map(percentage -> percentage * 1e-2)
.orElse(status.state() == SUCCESSFUL ? 1.0 : 0.0),
- metric.createContext(Map.of("clusterid", cluster,
- "documenttype", type.getName(),
- "state", toString(state))));
- // Set metric value to -1 for all states not currently active, so we only have one value >= 0 at any given time.
- for (Reindexing.State unset : EnumSet.complementOf(EnumSet.of(state)))
- metric.set("reindexing.progress",
- -1,
- metric.createContext(Map.of("clusterid", cluster,
- "documenttype", type.getName(),
- "state", toString(unset))));
+ metric.createContext(Map.of("clusterid", cluster, "documenttype", type.getName())));
});
}
- private static String toString(Reindexing.State state) {
- switch (state) {
- case READY: return "pending";
- case RUNNING: return "running";
- case FAILED: return "failed";
- case SUCCESSFUL: return "successful";
- default: throw new IllegalArgumentException("Unknown reindexing state '" + state + "'");
- }
- }
-
}
diff --git a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java
index 1bab8f6ee27..dffe1dd43ef 100644
--- a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java
+++ b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java
@@ -121,21 +121,8 @@ class ReindexerTest {
assertEquals(reindexing, database.readReindexing("cluster"));
assertTrue(shutDown.get(), "Session was shut down");
assertEquals(Map.of("reindexing.progress", Map.of(Map.of("documenttype", "music",
- "clusterid", "cluster",
- "state", "successful"),
- 1.0,
- Map.of("documenttype", "music",
- "clusterid", "cluster",
- "state", "pending"),
- -1.0,
- Map.of("documenttype", "music",
- "clusterid", "cluster",
- "state", "failed"),
- -1.0,
- Map.of("documenttype", "music",
- "clusterid", "cluster",
- "state", "running"),
- -1.0)),
+ "clusterid", "cluster"),
+ 1.0)),
metric.metrics());
// One more reindexing, this time shut down before visit completes, but after progress is reported.
@@ -159,8 +146,7 @@ class ReindexerTest {
assertEquals(1.0, // new ProgressToken() is 100% done.
metric.metrics().get("reindexing.progress")
.get(Map.of("documenttype", "music",
- "clusterid", "cluster",
- "state", "pending")));
+ "clusterid", "cluster")));
// Reindexer is created without any ready document types, which means nothing should run.
new Reindexer(cluster, triggers(), database, ReindexerTest::failIfCalled, metric, clock).reindex();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 50e6951f8be..38d7b6d3a2b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -224,6 +224,9 @@ public class InternalStepRunner implements StepRunner {
// Retry certain failures for up to one hour.
Optional<RunStatus> result = startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1)))
? Optional.of(deploymentFailed) : Optional.empty();
+ if (result.isPresent())
+ logger.log(WARNING, "Deployment failed for one hour; giving up now!");
+
switch (e.code()) {
case CERTIFICATE_NOT_READY:
logger.log("No valid CA signed certificate for app available to config server");
@@ -424,10 +427,14 @@ public class InternalStepRunner implements StepRunner {
Optional<ServiceConvergence> services = controller.serviceRegistry().configServer().serviceConvergence(new DeploymentId(testerId, zone),
Optional.of(platform));
if (services.isEmpty()) {
- logger.log("Config status not currently available -- will retry.");
- return run.stepInfo(installTester).get().startTime().get().isBefore(controller.clock().instant().minus(Duration.ofMinutes(5)))
- ? Optional.of(error)
- : Optional.empty();
+ if (run.stepInfo(installTester).get().startTime().get().isBefore(controller.clock().instant().minus(Duration.ofMinutes(30)))) {
+ logger.log(WARNING, "Config status not available after 30 minutes; giving up!");
+ return Optional.of(error);
+ }
+ else {
+ logger.log("Config status not currently available -- will retry.");
+ return Optional.empty();
+ }
}
List<Node> nodes = controller.serviceRegistry().configServer().nodeRepository().list(zone,
NodeFilter.all()