summary | refs | log | tree | commit | diff | stats
path: root/clustercontroller-reindexer
diff options
context:
space:
mode:
author: Jon Marius Venstad <venstad@gmail.com> 2021-03-17 11:05:04 +0100
committer: Jon Marius Venstad <venstad@gmail.com> 2021-03-17 11:05:04 +0100
commit: 76e960449534d63b7a593ae271ed7082d804d3d0 (patch)
tree: aa8ac05fa562e4b985a5cdbbeb9e362e8f3c853e /clustercontroller-reindexer
parent: 6699b03fcb353d0ba19910421b959bfc879294f8 (diff)
Let reindexing resume from FAILED state after 10 minutes
Diffstat (limited to 'clustercontroller-reindexer')
-rw-r--r-- clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java 9
-rw-r--r-- clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java 4
-rw-r--r-- clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java 14
3 files changed, 21 insertions, 6 deletions
diff --git a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java
index 54240ebd81c..93b21c8166b 100644
--- a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java
+++ b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java
@@ -42,6 +42,8 @@ public class Reindexer {
private static final Logger log = Logger.getLogger(Reindexer.class.getName());
+ static final Duration failureGrace = Duration.ofMinutes(10);
+
private final Cluster cluster;
private final Map<DocumentType, Instant> ready;
private final ReindexingCurator database;
@@ -133,14 +135,15 @@ public class Reindexer {
private void progress(DocumentType type, AtomicReference<Reindexing> reindexing, AtomicReference<Status> status) {
switch (status.get().state()) {
default:
- log.log(WARNING, "Unknown reindexing state '" + status.get().state() + "'");
- case FAILED:
- log.log(FINE, () -> "Not continuing reindexing of " + type + " due to previous failure");
+ log.log(WARNING, "Unknown reindexing state '" + status.get().state() + "'—not continuing reindexing of " + type);
case SUCCESSFUL: // Intentional fallthrough — all three are done states.
return;
case RUNNING:
log.log(WARNING, "Unexpected state 'RUNNING' of reindexing of " + type);
break;
+ case FAILED:
+ if (clock.instant().isBefore(status.get().endedAt().get().plus(failureGrace)))
+ return;
case READY:
status.updateAndGet(Status::running);
}
diff --git a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java
index 51322c37a7d..1b5a685b69c 100644
--- a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java
+++ b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java
@@ -91,8 +91,8 @@ public class Reindexing {
/** Returns a copy of this, in state RUNNING. */
public Status running() {
- if (state != State.READY)
- throw new IllegalStateException("Current state must be READY when changing to RUNNING");
+ if (state != State.READY && state != State.FAILED)
+ throw new IllegalStateException("Current state must be READY or FAILED when changing to RUNNING");
return new Status(startedAt, null, progress, State.RUNNING, null);
}
diff --git a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java
index cd9cf9f845d..ae2e757c662 100644
--- a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java
+++ b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java
@@ -172,9 +172,21 @@ class ReindexerTest {
assertEquals(reindexing, database.readReindexing("cluster"));
assertTrue(shutDown.get(), "Session was shut down");
- // Document type is ignored in next run, as it has failed fatally.
+ // Document type is ignored in next run, as it has failed, and grace period is not yet over.
+ clock.advance(Reindexer.failureGrace.minusMillis(1));
new Reindexer(cluster, Map.of(music, Instant.ofEpochMilli(30)), database, ReindexerTest::failIfCalled, metric, clock).reindex();
assertEquals(reindexing, database.readReindexing("cluster"));
+
+ // When failure grace period is over, reindexing resumes as usual.
+ clock.advance(Duration.ofMillis(1));
+ shutDown.set(false);
+ new Reindexer(cluster, Map.of(music, Instant.ofEpochMilli(30)), database, parameters -> {
+ executor.execute(() -> parameters.getControlHandler().onDone(VisitorControlHandler.CompletionCode.SUCCESS, "OK"));
+ return () -> shutDown.set(true);
+ }, metric, clock).reindex();
+ reindexing = reindexing.with(music, reindexing.status().get(music).running().successful(clock.instant()));
+ assertEquals(reindexing, database.readReindexing("cluster"));
+ assertTrue(shutDown.get(), "Session was shut down");
}
}