aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorJon Marius Venstad <venstad@gmail.com>2022-02-16 10:15:41 +0100
committerJon Marius Venstad <venstad@gmail.com>2022-02-16 13:43:34 +0100
commite6d239e24c5548aabc852282c13d94b46f63651b (patch)
tree00764c708af7c0d07b5eb28d4c397e8cbfe9e223 /controller-server
parente8446e07abeb197a8a3b165b0d68bcda36723f1c (diff)
Let tests lie dormant for a while (1/20 of time since deployment, clamped) after resetting
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java11
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java37
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java5
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java3
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java28
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json1
7 files changed, 66 insertions, 23 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 1b9a4232783..532f19eee21 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -635,7 +635,8 @@ public class InternalStepRunner implements StepRunner {
}
private Optional<RunStatus> endTests(RunId id, DualLogger logger) {
- if (deployment(id.application(), id.type()).isEmpty()) {
+ Optional<Deployment> deployment = deployment(id.application(), id.type());
+ if (deployment.isEmpty()) {
logger.log(INFO, "Deployment expired before tests could complete.");
return Optional.of(error);
}
@@ -664,7 +665,9 @@ public class InternalStepRunner implements StepRunner {
controller.jobController().updateTestReport(id);
return Optional.of(testFailure);
case INCONCLUSIVE:
- logger.log("Tests were inconclusive, and will run again.");
+ long sleepMinutes = Math.max(15, Math.min(120, Duration.between(deployment.get().at(), controller.clock().instant()).toMinutes() / 20));
+ logger.log("Tests were inconclusive, and will run again in " + sleepMinutes + " minutes.");
+ controller.jobController().locked(id, run -> run.sleepingUntil(controller.clock().instant().plusSeconds(60 * sleepMinutes)));
return Optional.of(reset);
case ERROR:
logger.log(INFO, "Tester failed running its tests!");
@@ -737,8 +740,12 @@ public class InternalStepRunner implements StepRunner {
private Optional<RunStatus> report(RunId id, DualLogger logger) {
try {
controller.jobController().active(id).ifPresent(run -> {
+ if (run.status() == reset)
+ return;
+
if (run.hasFailed())
sendEmailNotification(run, logger);
+
updateConsoleNotification(run);
});
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
index c1f8092ef5c..07e9d8e0082 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
@@ -32,6 +32,7 @@ public class Run {
private final boolean isRedeployment;
private final Instant start;
private final Optional<Instant> end;
+ private final Optional<Instant> sleepUntil;
private final RunStatus status;
private final long lastTestRecord;
private final Instant lastVespaLogTimestamp;
@@ -42,7 +43,7 @@ public class Run {
// For deserialisation only -- do not use!
public Run(RunId id, Map<Step, StepInfo> steps, Versions versions, boolean isRedeployment, Instant start, Optional<Instant> end,
- RunStatus status, long lastTestRecord, Instant lastVespaLogTimestamp, Optional<Instant> noNodesDownSince,
+ Optional<Instant> sleepUntil, RunStatus status, long lastTestRecord, Instant lastVespaLogTimestamp, Optional<Instant> noNodesDownSince,
Optional<ConvergenceSummary> convergenceSummary, Optional<X509Certificate> testerCertificate, boolean dryRun) {
this.id = id;
this.steps = Collections.unmodifiableMap(new EnumMap<>(steps));
@@ -50,6 +51,7 @@ public class Run {
this.isRedeployment = isRedeployment;
this.start = start;
this.end = end;
+ this.sleepUntil = sleepUntil;
this.status = status;
this.lastTestRecord = lastTestRecord;
this.lastVespaLogTimestamp = lastVespaLogTimestamp;
@@ -62,7 +64,7 @@ public class Run {
public static Run initial(RunId id, Versions versions, boolean isRedeployment, Instant now, JobProfile profile) {
EnumMap<Step, StepInfo> steps = new EnumMap<>(Step.class);
profile.steps().forEach(step -> steps.put(step, StepInfo.initial(step)));
- return new Run(id, steps, requireNonNull(versions), isRedeployment, requireNonNull(now), Optional.empty(), running,
+ return new Run(id, steps, requireNonNull(versions), isRedeployment, requireNonNull(now), Optional.empty(), Optional.empty(), running,
-1, Instant.EPOCH, Optional.empty(), Optional.empty(), Optional.empty(), profile == JobProfile.developmentDryRun);
}
@@ -76,7 +78,7 @@ public class Run {
EnumMap<Step, StepInfo> steps = new EnumMap<>(this.steps);
steps.put(step.get(), stepInfo.with(Step.Status.of(status)));
- return new Run(id, steps, versions, isRedeployment, start, end, this.status == running ? status : this.status,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, this.status == running ? status : this.status,
lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun);
}
@@ -91,19 +93,19 @@ public class Run {
EnumMap<Step, StepInfo> steps = new EnumMap<>(this.steps);
steps.put(step.get(), stepInfo.with(startTime));
- return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp,
noNodesDownSince, convergenceSummary, testerCertificate, dryRun);
}
public Run finished(Instant now) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, Optional.of(now), status == running ? success : status,
+ return new Run(id, steps, versions, isRedeployment, start, Optional.of(now), sleepUntil, status == running ? success : status,
lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, Optional.empty(), dryRun);
}
public Run aborted() {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, aborted, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, aborted, lastTestRecord, lastVespaLogTimestamp,
noNodesDownSince, convergenceSummary, testerCertificate, dryRun);
}
@@ -111,40 +113,46 @@ public class Run {
requireActive();
Map<Step, StepInfo> reset = new EnumMap<>(steps);
reset.replaceAll((step, __) -> StepInfo.initial(step));
- return new Run(id, reset, versions, isRedeployment, start, Optional.empty(), running, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, reset, versions, isRedeployment, start, end, sleepUntil, running, lastTestRecord, lastVespaLogTimestamp,
Optional.empty(), Optional.empty(), testerCertificate, dryRun);
}
public Run with(long lastTestRecord) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp,
noNodesDownSince, convergenceSummary, testerCertificate, dryRun);
}
public Run with(Instant lastVespaLogTimestamp) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp,
noNodesDownSince, convergenceSummary, testerCertificate, dryRun);
}
public Run noNodesDownSince(Instant noNodesDownSince) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp,
Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate, dryRun);
}
public Run withSummary(ConvergenceSummary convergenceSummary) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp,
noNodesDownSince, Optional.ofNullable(convergenceSummary), testerCertificate, dryRun);
}
public Run with(X509Certificate testerCertificate) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, status, lastTestRecord, lastVespaLogTimestamp,
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp,
noNodesDownSince, convergenceSummary, Optional.of(testerCertificate), dryRun);
}
+ public Run sleepingUntil(Instant instant) {
+ requireActive();
+ return new Run(id, steps, versions, isRedeployment, start, end, Optional.of(instant), status, lastTestRecord, lastVespaLogTimestamp,
+ noNodesDownSince, convergenceSummary, testerCertificate, dryRun);
+ }
+
/** Returns the id of this run. */
public RunId id() {
return id;
@@ -193,6 +201,11 @@ public class Run {
return end;
}
+ /** Returns the instant until which this should sleep. */
+ public Optional<Instant> sleepUntil() {
+ return sleepUntil;
+ }
+
/** Returns whether the run has failed, and should switch to its run-always steps. */
public boolean hasFailed() {
return status != running && status != success;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java
index 71832352d4d..24723f84897 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepInfo.java
@@ -6,12 +6,12 @@ import java.util.Objects;
import java.util.Optional;
/**
- * Information about a step.
+ * Information about a step. Immutable.
*
* @author hakonhall
*/
-// @Immutable
public class StepInfo {
+
private final Step step;
private final Step.Status status;
private final Optional<Instant> startTime;
@@ -57,4 +57,5 @@ public class StepInfo {
public int hashCode() {
return Objects.hash(step, status, startTime);
}
+
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
index 85d750d267c..3184449e48b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
@@ -78,7 +78,7 @@ public class JobRunner extends ControllerMaintainer {
/** Advances each of the ready steps for the given run, or marks it as finished, and stashes it. Public for testing. */
public void advance(Run run) {
if ( ! run.hasFailed()
- && controller().clock().instant().isAfter(run.start().plus(jobTimeout)))
+ && controller().clock().instant().isAfter(run.sleepUntil().orElse(run.start()).plus(jobTimeout)))
executors.execute(() -> {
jobs.abort(run.id());
advance(jobs.run(run.id()).get());
@@ -86,7 +86,7 @@ public class JobRunner extends ControllerMaintainer {
else if (run.readySteps().isEmpty())
executors.execute(() -> finish(run.id()));
- else
+ else if (run.hasFailed() || run.sleepUntil().map(sleepUntil -> ! sleepUntil.isAfter(controller().clock().instant())).orElse(true))
run.readySteps().forEach(step -> executors.execute(() -> advance(run.id(), step)));
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
index 75760cd5d00..800d5e2750b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
@@ -83,6 +83,7 @@ class RunSerializer {
private static final String numberField = "number";
private static final String startField = "start";
private static final String endField = "end";
+ private static final String sleepingUntilField = "sleepingUntil";
private static final String statusField = "status";
private static final String versionsField = "versions";
private static final String isRedeploymentField = "isRedeployment";
@@ -140,6 +141,7 @@ class RunSerializer {
runObject.field(isRedeploymentField).asBool(),
SlimeUtils.instant(runObject.field(startField)),
SlimeUtils.optionalInstant(runObject.field(endField)),
+ SlimeUtils.optionalInstant(runObject.field(sleepingUntilField)),
runStatusOf(runObject.field(statusField).asString()),
runObject.field(lastTestRecordField).asLong(),
Instant.EPOCH.plus(runObject.field(lastVespaLogTimestampField).asLong(), ChronoUnit.MICROS),
@@ -229,6 +231,7 @@ class RunSerializer {
runObject.setLong(numberField, run.id().number());
runObject.setLong(startField, run.start().toEpochMilli());
run.end().ifPresent(end -> runObject.setLong(endField, end.toEpochMilli()));
+ run.sleepUntil().ifPresent(end -> runObject.setLong(sleepingUntilField, end.toEpochMilli()));
runObject.setString(statusField, valueOf(run.status()));
runObject.setLong(lastTestRecordField, run.lastTestLogEntry());
runObject.setLong(lastVespaLogTimestampField, Instant.EPOCH.until(run.lastVespaLogTimestamp(), ChronoUnit.MICROS));
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
index 9b96eb900c0..5b6a4621e52 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
@@ -28,6 +28,7 @@ import com.yahoo.vespa.hosted.controller.application.SystemApplication;
import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.config.ControllerConfig;
import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock;
+import com.yahoo.vespa.hosted.controller.maintenance.JobRunner;
import org.junit.Before;
import org.junit.Test;
@@ -344,21 +345,38 @@ public class InternalStepRunnerTest {
tester.cloud().add(new LogEntry(0, Instant.ofEpochMilli(123), info, "Not enough data!"));
tester.cloud().set(TesterCloud.Status.INCONCLUSIVE);
- long lastId = tester.jobs().details(id).get().lastId().getAsLong();
+ long lastId1 = tester.jobs().details(id).get().lastId().getAsLong();
+ Instant instant1 = tester.clock().instant();
tester.runner().run();
assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.endTests));
assertEquals(running, tester.jobs().run(id).get().status());
- // TODO: Add some sleeping, so this isn't immediately re-run.
+ // Test sleeps for a while.
+ tester.runner().run();
assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.deployTester));
+ tester.clock().advance(Duration.ofSeconds(899));
+ tester.runner().run();
+ assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.deployTester));
+
+ tester.clock().advance(JobRunner.jobTimeout);
+ var testZone = JobType.systemTest.zone(tester.controller().system());
+ tester.runner().run();
+ app.flushDnsUpdates();
+ tester.configServer().convergeServices(app.instanceId(), testZone);
+ tester.configServer().convergeServices(app.testerId().id(), testZone);
+ tester.runner().run();
+ assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.endTests));
+ assertTrue(tester.jobs().run(id).get().steps().get(Step.endTests).startTime().isPresent());
+
tester.cloud().set(TesterCloud.Status.SUCCESS);
+ long lastId2 = tester.jobs().details(id).get().lastId().getAsLong();
tester.runner().run();
assertEquals(success, tester.jobs().run(id).get().status());
assertTestLogEntries(id, Step.endTests,
- new LogEntry(lastId + 1, Instant.ofEpochMilli(123), info, "Not enough data!"),
- new LogEntry(lastId + 2, tester.clock().instant(), info, "Tests were inconclusive, and will run again."),
- new LogEntry(lastId + 3, tester.clock().instant(), info, "Tests completed successfully."));
+ new LogEntry(lastId1 + 1, Instant.ofEpochMilli(123), info, "Not enough data!"),
+ new LogEntry(lastId1 + 2, instant1, info, "Tests were inconclusive, and will run again in 15 minutes."),
+ new LogEntry(lastId2 + 1, tester.clock().instant(), info, "Tests completed successfully."));
}
@Test
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json
index 7b9131a38dd..54cde2bacef 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json
@@ -4,6 +4,7 @@
"type": "production-us-east-3",
"number": 112358,
"start": 1196676930000,
+ "sleepUntil": 1196676930100,
"status": "running",
"lastTestRecord": 3,
"lastVespaLogTimestamp": 1196676930000432,