diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2021-03-17 11:05:04 +0100 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2021-03-17 11:05:04 +0100 |
commit | 76e960449534d63b7a593ae271ed7082d804d3d0 (patch) | |
tree | aa8ac05fa562e4b985a5cdbbeb9e362e8f3c853e /clustercontroller-reindexer | |
parent | 6699b03fcb353d0ba19910421b959bfc879294f8 (diff) |
Let reindexing resume from FAILED state after 10 minutes
Diffstat (limited to 'clustercontroller-reindexer')
3 files changed, 21 insertions, 6 deletions
diff --git a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java index 54240ebd81c..93b21c8166b 100644 --- a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java +++ b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexer.java @@ -42,6 +42,8 @@ public class Reindexer { private static final Logger log = Logger.getLogger(Reindexer.class.getName()); + static final Duration failureGrace = Duration.ofMinutes(10); + private final Cluster cluster; private final Map<DocumentType, Instant> ready; private final ReindexingCurator database; @@ -133,14 +135,15 @@ public class Reindexer { private void progress(DocumentType type, AtomicReference<Reindexing> reindexing, AtomicReference<Status> status) { switch (status.get().state()) { default: - log.log(WARNING, "Unknown reindexing state '" + status.get().state() + "'"); - case FAILED: - log.log(FINE, () -> "Not continuing reindexing of " + type + " due to previous failure"); + log.log(WARNING, "Unknown reindexing state '" + status.get().state() + "'—not continuing reindexing of " + type); case SUCCESSFUL: // Intentional fallthrough — all three are done states. return; case RUNNING: log.log(WARNING, "Unexpected state 'RUNNING' of reindexing of " + type); break; + case FAILED: + if (clock.instant().isBefore(status.get().endedAt().get().plus(failureGrace))) + return; case READY: status.updateAndGet(Status::running); } diff --git a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java index 51322c37a7d..1b5a685b69c 100644 --- a/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java +++ b/clustercontroller-reindexer/src/main/java/ai/vespa/reindexing/Reindexing.java @@ -91,8 +91,8 @@ public class Reindexing { /** Returns a copy of this, in state RUNNING. */ public Status running() { - if (state != State.READY) - throw new IllegalStateException("Current state must be READY when changing to RUNNING"); + if (state != State.READY && state != State.FAILED) + throw new IllegalStateException("Current state must be READY or FAILED when changing to RUNNING"); return new Status(startedAt, null, progress, State.RUNNING, null); } diff --git a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java index cd9cf9f845d..ae2e757c662 100644 --- a/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java +++ b/clustercontroller-reindexer/src/test/java/ai/vespa/reindexing/ReindexerTest.java @@ -172,9 +172,21 @@ class ReindexerTest { assertEquals(reindexing, database.readReindexing("cluster")); assertTrue(shutDown.get(), "Session was shut down"); - // Document type is ignored in next run, as it has failed fatally. + // Document type is ignored in next run, as it has failed, and grace period is not yet over. + clock.advance(Reindexer.failureGrace.minusMillis(1)); new Reindexer(cluster, Map.of(music, Instant.ofEpochMilli(30)), database, ReindexerTest::failIfCalled, metric, clock).reindex(); assertEquals(reindexing, database.readReindexing("cluster")); + + // When failure grace period is over, reindexing resumes as usual. + clock.advance(Duration.ofMillis(1)); + shutDown.set(false); + new Reindexer(cluster, Map.of(music, Instant.ofEpochMilli(30)), database, parameters -> { + executor.execute(() -> parameters.getControlHandler().onDone(VisitorControlHandler.CompletionCode.SUCCESS, "OK")); + return () -> shutDown.set(true); + }, metric, clock).reindex(); + reindexing = reindexing.with(music, reindexing.status().get(music).running().successful(clock.instant())); + assertEquals(reindexing, database.readReindexing("cluster")); + assertTrue(shutDown.get(), "Session was shut down"); } } |