diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2022-02-16 10:15:41 +0100 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2022-02-16 13:43:34 +0100 |
commit | e6d239e24c5548aabc852282c13d94b46f63651b (patch) | |
tree | 00764c708af7c0d07b5eb28d4c397e8cbfe9e223 /controller-server | |
parent | e8446e07abeb197a8a3b165b0d68bcda36723f1c (diff) |
Let tests lie dormant for a while (1/20 of time since deployment, clamped) after resetting
Diffstat (limited to 'controller-server')
7 files changed, 66 insertions, 23 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 1b9a4232783..532f19eee21 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -635,7 +635,8 @@ public class InternalStepRunner implements StepRunner { } private Optional<RunStatus> endTests(RunId id, DualLogger logger) { - if (deployment(id.application(), id.type()).isEmpty()) { + Optional<Deployment> deployment = deployment(id.application(), id.type()); + if (deployment.isEmpty()) { logger.log(INFO, "Deployment expired before tests could complete."); return Optional.of(error); } @@ -664,7 +665,9 @@ public class InternalStepRunner implements StepRunner { controller.jobController().updateTestReport(id); return Optional.of(testFailure); case INCONCLUSIVE: - logger.log("Tests were inconclusive, and will run again."); + long sleepMinutes = Math.max(15, Math.min(120, Duration.between(deployment.get().at(), controller.clock().instant()).toMinutes() / 20)); + logger.log("Tests were inconclusive, and will run again in " + sleepMinutes + " minutes."); + controller.jobController().locked(id, run -> run.sleepingUntil(controller.clock().instant().plusSeconds(60 * sleepMinutes))); return Optional.of(reset); case ERROR: logger.log(INFO, "Tester failed running its tests!"); @@ -737,8 +740,12 @@ public class InternalStepRunner implements StepRunner { private Optional<RunStatus> report(RunId id, DualLogger logger) { try { controller.jobController().active(id).ifPresent(run -> { + if (run.status() == reset) + return; + if (run.hasFailed()) sendEmailNotification(run, logger); + updateConsoleNotification(run); }); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java index c1f8092ef5c..07e9d8e0082 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java @@ -32,6 +32,7 @@ public class Run { private final boolean isRedeployment; private final Instant start; private final Optional<Instant> end; + private final Optional<Instant> sleepUntil; private final RunStatus status; private final long lastTestRecord; private final Instant lastVespaLogTimestamp; @@ -42,7 +43,7 @@ public class Run { // For deserialisation only -- do not use! public Run(RunId id, Map<Step, StepInfo> steps, Versions versions, boolean isRedeployment, Instant start, Optional<Instant> end, - RunStatus status, long lastTestRecord, Instant lastVespaLogTimestamp, Optional<Instant> noNodesDownSince, + Optional<Instant> sleepUntil, RunStatus status, long lastTestRecord, Instant lastVespaLogTimestamp, Optional<Instant> noNodesDownSince, Optional<ConvergenceSummary> convergenceSummary, Optional<X509Certificate> testerCertificate, boolean dryRun) { this.id = id; this.steps = Collections.unmodifiableMap(new EnumMap<>(steps)); @@ -50,6 +51,7 @@ public class Run { this.isRedeployment = isRedeployment; this.start = start; this.end = end; + this.sleepUntil = sleepUntil; this.status = status; this.lastTestRecord = lastTestRecord; this.lastVespaLogTimestamp = lastVespaLogTimestamp; @@ -62,7 +64,7 @@ public class Run { public static Run initial(RunId id, Versions versions, boolean isRedeployment, Instant now, JobProfile profile) { EnumMap<Step, StepInfo> steps = new EnumMap<>(Step.class); profile.steps().forEach(step -> steps.put(step, StepInfo.initial(step))); - return new Run(id, steps, requireNonNull(versions), isRedeployment, requireNonNull(now), Optional.empty(), running, + return new Run(id, steps, requireNonNull(versions), isRedeployment, requireNonNull(now), Optional.empty(), Optional.empty(), running, -1, Instant.EPOCH, Optional.empty(), Optional.empty(), Optional.empty(), profile == JobProfile.developmentDryRun); } @@ -76,7 +78,7 @@ public class Run { EnumMap<Step, StepInfo> steps = new EnumMap<>(this.steps); steps.put(step.get(), stepInfo.with(Step.Status.of(status))); - return new Run(id, steps, versions, isRedeployment, start, end, this.status == running ? status : this.status, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, this.status == running ? status : this.status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun); } @@ -91,19 +93,19 @@ public class Run { EnumMap<Step, StepInfo> steps = new EnumMap<>(this.steps); steps.put(step.get(), stepInfo.with(startTime)); - return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun); } public Run finished(Instant now) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, Optional.of(now), status == running ? success : status, + return new Run(id, steps, versions, isRedeployment, start, Optional.of(now), sleepUntil, status == running ? success : status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, Optional.empty(), dryRun); } public Run aborted() { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, aborted, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, aborted, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun); } @@ -111,40 +113,46 @@ public class Run { requireActive(); Map<Step, StepInfo> reset = new EnumMap<>(steps); reset.replaceAll((step, __) -> StepInfo.initial(step)); - return new Run(id, reset, versions, isRedeployment, start, Optional.empty(), running, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, reset, versions, isRedeployment, start, end, sleepUntil, running, lastTestRecord, lastVespaLogTimestamp, Optional.empty(), Optional.empty(), testerCertificate, dryRun); } public Run with(long lastTestRecord) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun); } public Run with(Instant lastVespaLogTimestamp) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun); } public Run noNodesDownSince(Instant noNodesDownSince) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate, dryRun); } public Run withSummary(ConvergenceSummary convergenceSummary) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, Optional.ofNullable(convergenceSummary), testerCertificate, dryRun); } public Run with(X509Certificate testerCertificate) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp, + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, Optional.of(testerCertificate), dryRun); } + public Run sleepingUntil(Instant instant) { + requireActive(); + return new Run(id, steps, versions, isRedeployment, start, end, Optional.of(instant), status, lastTestRecord, lastVespaLogTimestamp, + noNodesDownSince, convergenceSummary, testerCertificate, dryRun); + } + /** Returns the id of this run. */ public RunId id() { return id; @@ -193,6 +201,11 @@ public class Run { return end; } + /** Returns the instant until which this should sleep. */ + public Optional<Instant> sleepUntil() { + return sleepUntil; + } + /** Returns whether the run has failed, and should switch to its run-always steps. */ public boolean hasFailed() { return status != running && status != success; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java index 71832352d4d..24723f84897 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java @@ -6,12 +6,12 @@ import java.util.Objects; import java.util.Optional; /** - * Information about a step. + * Information about a step. Immutable. * * @author hakonhall */ -// @Immutable public class StepInfo { + private final Step step; private final Step.Status status; private final Optional<Instant> startTime; @@ -57,4 +57,5 @@ public class StepInfo { public int hashCode() { return Objects.hash(step, status, startTime); } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index 85d750d267c..3184449e48b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -78,7 +78,7 @@ public class JobRunner extends ControllerMaintainer { /** Advances each of the ready steps for the given run, or marks it as finished, and stashes it. Public for testing. */ public void advance(Run run) { if ( ! run.hasFailed() - && controller().clock().instant().isAfter(run.start().plus(jobTimeout))) + && controller().clock().instant().isAfter(run.sleepUntil().orElse(run.start()).plus(jobTimeout))) executors.execute(() -> { jobs.abort(run.id()); advance(jobs.run(run.id()).get()); @@ -86,7 +86,7 @@ public class JobRunner extends ControllerMaintainer { else if (run.readySteps().isEmpty()) executors.execute(() -> finish(run.id())); - else + else if (run.hasFailed() || run.sleepUntil().map(sleepUntil -> ! sleepUntil.isAfter(controller().clock().instant())).orElse(true)) run.readySteps().forEach(step -> executors.execute(() -> advance(run.id(), step))); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java index 75760cd5d00..800d5e2750b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java @@ -83,6 +83,7 @@ class RunSerializer { private static final String numberField = "number"; private static final String startField = "start"; private static final String endField = "end"; + private static final String sleepingUntilField = "sleepingUntil"; private static final String statusField = "status"; private static final String versionsField = "versions"; private static final String isRedeploymentField = "isRedeployment"; @@ -140,6 +141,7 @@ class RunSerializer { runObject.field(isRedeploymentField).asBool(), SlimeUtils.instant(runObject.field(startField)), SlimeUtils.optionalInstant(runObject.field(endField)), + SlimeUtils.optionalInstant(runObject.field(sleepingUntilField)), runStatusOf(runObject.field(statusField).asString()), runObject.field(lastTestRecordField).asLong(), Instant.EPOCH.plus(runObject.field(lastVespaLogTimestampField).asLong(), ChronoUnit.MICROS), @@ -229,6 +231,7 @@ class RunSerializer { runObject.setLong(numberField, run.id().number()); runObject.setLong(startField, run.start().toEpochMilli()); run.end().ifPresent(end -> runObject.setLong(endField, end.toEpochMilli())); + run.sleepUntil().ifPresent(end -> runObject.setLong(sleepingUntilField, end.toEpochMilli())); runObject.setString(statusField, valueOf(run.status())); runObject.setLong(lastTestRecordField, run.lastTestLogEntry()); runObject.setLong(lastVespaLogTimestampField, Instant.EPOCH.until(run.lastVespaLogTimestamp(), ChronoUnit.MICROS)); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java index 9b96eb900c0..5b6a4621e52 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java @@ -28,6 +28,7 @@ import com.yahoo.vespa.hosted.controller.application.SystemApplication; import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; import com.yahoo.vespa.hosted.controller.config.ControllerConfig; import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock; +import com.yahoo.vespa.hosted.controller.maintenance.JobRunner; import org.junit.Before; import org.junit.Test; @@ -344,21 +345,38 @@ public class InternalStepRunnerTest { tester.cloud().add(new LogEntry(0, Instant.ofEpochMilli(123), info, "Not enough data!")); tester.cloud().set(TesterCloud.Status.INCONCLUSIVE); - long lastId = tester.jobs().details(id).get().lastId().getAsLong(); + long lastId1 = tester.jobs().details(id).get().lastId().getAsLong(); + Instant instant1 = tester.clock().instant(); tester.runner().run(); assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.endTests)); assertEquals(running, tester.jobs().run(id).get().status()); - // TODO: Add some sleeping, so this isn't immediately re-run. + // Test sleeps for a while. + tester.runner().run(); assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.deployTester)); + tester.clock().advance(Duration.ofSeconds(899)); + tester.runner().run(); + assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.deployTester)); + + tester.clock().advance(JobRunner.jobTimeout); + var testZone = JobType.systemTest.zone(tester.controller().system()); + tester.runner().run(); + app.flushDnsUpdates(); + tester.configServer().convergeServices(app.instanceId(), testZone); + tester.configServer().convergeServices(app.testerId().id(), testZone); + tester.runner().run(); + assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.endTests)); + assertTrue(tester.jobs().run(id).get().steps().get(Step.endTests).startTime().isPresent()); + tester.cloud().set(TesterCloud.Status.SUCCESS); + long lastId2 = tester.jobs().details(id).get().lastId().getAsLong(); tester.runner().run(); assertEquals(success, tester.jobs().run(id).get().status()); assertTestLogEntries(id, Step.endTests, - new LogEntry(lastId + 1, Instant.ofEpochMilli(123), info, "Not enough data!"), - new LogEntry(lastId + 2, tester.clock().instant(), info, "Tests were inconclusive, and will run again."), - new LogEntry(lastId + 3, tester.clock().instant(), info, "Tests completed successfully.")); + new LogEntry(lastId1 + 1, Instant.ofEpochMilli(123), info, "Not enough data!"), + new LogEntry(lastId1 + 2, instant1, info, "Tests were inconclusive, and will run again in 15 minutes."), + new LogEntry(lastId2 + 1, tester.clock().instant(), info, "Tests completed successfully.")); } @Test diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json index 7b9131a38dd..54cde2bacef 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json @@ -4,6 +4,7 @@ "type": "production-us-east-3", "number": 112358, "start": 1196676930000, + "sleepUntil": 1196676930100, "status": "running", "lastTestRecord": 3, "lastVespaLogTimestamp": 1196676930000432, |