diff options
3 files changed, 15 insertions, 42 deletions
diff --git a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java index 8a48e4ce272..a1aa5287d2f 100644 --- a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java +++ b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/ReindexingMetrics.java @@ -27,31 +27,11 @@ class ReindexingMetrics { void dump(Reindexing reindexing) { reindexing.status().forEach((type, status) -> { - Reindexing.State state = status.state(); metric.set("reindexing.progress", status.progress().map(ProgressToken::percentFinished).map(percentage -> percentage * 1e-2) .orElse(status.state() == SUCCESSFUL ? 1.0 : 0.0), - metric.createContext(Map.of("clusterid", cluster, - "documenttype", type.getName(), - "state", toString(state)))); - // Set metric value to -1 for all states not currently active, so we only have one value >= 0 at any given time. - for (Reindexing.State unset : EnumSet.complementOf(EnumSet.of(state))) - metric.set("reindexing.progress", - -1, - metric.createContext(Map.of("clusterid", cluster, - "documenttype", type.getName(), - "state", toString(unset)))); + metric.createContext(Map.of("clusterid", cluster, "documenttype", type.getName()))); }); } - private static String toString(Reindexing.State state) { - switch (state) { - case READY: return "pending"; - case RUNNING: return "running"; - case FAILED: return "failed"; - case SUCCESSFUL: return "successful"; - default: throw new IllegalArgumentException("Unknown reindexing state '" + state + "'"); - } - } - } diff --git a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java index 1bab8f6ee27..dffe1dd43ef 100644 --- a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java +++ b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java @@ -121,21 +121,8 @@ class ReindexerTest { assertEquals(reindexing, database.readReindexing("cluster")); assertTrue(shutDown.get(), "Session was shut down"); assertEquals(Map.of("reindexing.progress", Map.of(Map.of("documenttype", "music", - "clusterid", "cluster", - "state", "successful"), - 1.0, - Map.of("documenttype", "music", - "clusterid", "cluster", - "state", "pending"), - -1.0, - Map.of("documenttype", "music", - "clusterid", "cluster", - "state", "failed"), - -1.0, - Map.of("documenttype", "music", - "clusterid", "cluster", - "state", "running"), - -1.0)), + "clusterid", "cluster"), + 1.0)), metric.metrics()); // One more reindexing, this time shut down before visit completes, but after progress is reported. @@ -159,8 +146,7 @@ class ReindexerTest { assertEquals(1.0, // new ProgressToken() is 100% done. metric.metrics().get("reindexing.progress") .get(Map.of("documenttype", "music", - "clusterid", "cluster", - "state", "pending"))); + "clusterid", "cluster"))); // Reindexer is created without any ready document types, which means nothing should run. new Reindexer(cluster, triggers(), database, ReindexerTest::failIfCalled, metric, clock).reindex(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 50e6951f8be..38d7b6d3a2b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -224,6 +224,9 @@ public class InternalStepRunner implements StepRunner { // Retry certain failures for up to one hour. Optional<RunStatus> result = startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1))) ? Optional.of(deploymentFailed) : Optional.empty(); + if (result.isPresent()) + logger.log(WARNING, "Deployment failed for one hour; giving up now!"); + switch (e.code()) { case CERTIFICATE_NOT_READY: logger.log("No valid CA signed certificate for app available to config server"); @@ -424,10 +427,14 @@ public class InternalStepRunner implements StepRunner { Optional<ServiceConvergence> services = controller.serviceRegistry().configServer().serviceConvergence(new DeploymentId(testerId, zone), Optional.of(platform)); if (services.isEmpty()) { - logger.log("Config status not currently available -- will retry."); - return run.stepInfo(installTester).get().startTime().get().isBefore(controller.clock().instant().minus(Duration.ofMinutes(5))) - ? Optional.of(error) - : Optional.empty(); + if (run.stepInfo(installTester).get().startTime().get().isBefore(controller.clock().instant().minus(Duration.ofMinutes(30)))) { + logger.log(WARNING, "Config status not available after 30 minutes; giving up!"); + return Optional.of(error); + } + else { + logger.log("Config status not currently available -- will retry."); + return Optional.empty(); + } } List<Node> nodes = controller.serviceRegistry().configServer().nodeRepository().list(zone, NodeFilter.all() |