diff options
7 files changed, 52 insertions, 28 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java index 30f16acf77d..33339e4d480 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java @@ -67,10 +67,12 @@ import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.unfinished; import static com.yahoo.vespa.hosted.controller.deployment.Step.copyVespaLogs; import static com.yahoo.vespa.hosted.controller.deployment.Step.deactivateTester; +import static com.yahoo.vespa.hosted.controller.deployment.Step.deployTester; import static com.yahoo.vespa.hosted.controller.deployment.Step.endStagingSetup; import static com.yahoo.vespa.hosted.controller.deployment.Step.endTests; import static com.yahoo.vespa.hosted.controller.deployment.Step.report; import static java.time.temporal.ChronoUnit.SECONDS; +import static java.util.Comparator.comparing; import static java.util.Comparator.naturalOrder; import static java.util.function.Predicate.not; import static java.util.logging.Level.INFO; @@ -179,16 +181,28 @@ public class JobController { if (deployment.isEmpty() || deployment.get().at().isBefore(run.start())) return run; - Instant from = run.lastVespaLogTimestamp().isAfter(deployment.get().at()) ? run.lastVespaLogTimestamp() : deployment.get().at(); + Instant vespaFrom = run.lastVespaLogTimestamp().isAfter(deployment.get().at()) ? run.lastVespaLogTimestamp() : deployment.get().at(); List<LogEntry> log = LogEntry.parseVespaLog(controller.serviceRegistry().configServer() .getLogs(new DeploymentId(id.application(), zone), - Map.of("from", Long.toString(from.toEpochMilli()))), - from); + Map.of("from", Long.toString(vespaFrom.toEpochMilli()))), + vespaFrom); + vespaFrom = log.isEmpty() ? vespaFrom : log.get(log.size() - 1).at(); + + Instant testerFrom = run.lastTesterLogTimestamp().isAfter(deployment.get().at()) ? run.lastTesterLogTimestamp() : deployment.get().at(); + if (run.hasStep(deployTester) && run.versions().targetPlatform().isAfter(new Version("7.589.14"))) { // todo jonmv: remove + List<LogEntry> testerLog = LogEntry.parseVespaLog(controller.serviceRegistry().configServer() + .getLogs(new DeploymentId(id.tester().id(), zone), + Map.of("from", Long.toString(testerFrom.toEpochMilli()))), + testerFrom); + testerFrom = testerLog.isEmpty() ? testerFrom : testerLog.get(testerLog.size() - 1).at(); + + log = Stream.concat(log.stream(), testerLog.stream()).sorted(comparing(LogEntry::at)).collect(toUnmodifiableList()); + } if (log.isEmpty()) return run; logs.append(id.application(), id.type(), Step.copyVespaLogs, log); - return run.with(log.get(log.size() - 1).at()); + return run.with(vespaFrom, testerFrom); }); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java index 03cc6c6ba8d..7ee9fb2801b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java @@ -37,6 +37,7 @@ public class Run { private final RunStatus status; private final long lastTestRecord; private final Instant lastVespaLogTimestamp; + private final Instant lastTesterLogTimestamp; private final Optional<Instant> noNodesDownSince; private final Optional<ConvergenceSummary> convergenceSummary; private final Optional<X509Certificate> testerCertificate; @@ -45,8 +46,9 @@ public class Run { // For deserialisation only -- do not use! public Run(RunId id, Map<Step, StepInfo> steps, Versions versions, boolean isRedeployment, Instant start, Optional<Instant> end, - Optional<Instant> sleepUntil, RunStatus status, long lastTestRecord, Instant lastVespaLogTimestamp, Optional<Instant> noNodesDownSince, - Optional<ConvergenceSummary> convergenceSummary, Optional<X509Certificate> testerCertificate, boolean dryRun, Optional<String> reason) { + Optional<Instant> sleepUntil, RunStatus status, long lastTestRecord, Instant lastVespaLogTimestamp, + Instant lastTesterLogTimestamp, Optional<Instant> noNodesDownSince, Optional<ConvergenceSummary> convergenceSummary, + Optional<X509Certificate> testerCertificate, boolean dryRun, Optional<String> reason) { this.id = id; this.steps = Collections.unmodifiableMap(new EnumMap<>(steps)); this.versions = versions; @@ -57,6 +59,7 @@ public class Run { this.status = status; this.lastTestRecord = lastTestRecord; this.lastVespaLogTimestamp = lastVespaLogTimestamp; + this.lastTesterLogTimestamp = lastTesterLogTimestamp; this.noNodesDownSince = noNodesDownSince; this.convergenceSummary = convergenceSummary; this.testerCertificate = testerCertificate; @@ -67,8 +70,9 @@ public class Run { public static Run initial(RunId id, Versions versions, boolean isRedeployment, Instant now, JobProfile profile, Optional<String> triggeredBy) { EnumMap<Step, StepInfo> steps = new EnumMap<>(Step.class); profile.steps().forEach(step -> steps.put(step, StepInfo.initial(step))); - return new Run(id, steps, requireNonNull(versions), isRedeployment, requireNonNull(now), Optional.empty(), Optional.empty(), running, - -1, Instant.EPOCH, Optional.empty(), Optional.empty(), Optional.empty(), profile == JobProfile.developmentDryRun, triggeredBy); + return new Run(id, steps, requireNonNull(versions), isRedeployment, requireNonNull(now), Optional.empty(), + Optional.empty(), running, -1, Instant.EPOCH, Instant.EPOCH, Optional.empty(), Optional.empty(), + Optional.empty(), profile == JobProfile.developmentDryRun, triggeredBy); } /** Returns a new Run with the status of the given completed step set accordingly. */ @@ -83,7 +87,7 @@ public class Run { steps.put(step.get(), stepInfo.with(Step.Status.of(status))); RunStatus newStatus = hasFailed() || status == running ? this.status : status; return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, newStatus, lastTestRecord, - lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + lastVespaLogTimestamp, lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); } /** Returns a new Run with a new start time*/ @@ -98,19 +102,19 @@ public class Run { steps.put(step.get(), stepInfo.with(startTime)); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); } public Run finished(Instant now) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, Optional.of(now), sleepUntil, status == running ? success : status, - lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, Optional.empty(), dryRun, reason); + lastTestRecord, lastVespaLogTimestamp, lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, Optional.empty(), dryRun, reason); } public Run aborted() { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, aborted, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); } public Run reset() { @@ -118,43 +122,43 @@ public class Run { Map<Step, StepInfo> reset = new EnumMap<>(steps); reset.replaceAll((step, __) -> StepInfo.initial(step)); return new Run(id, reset, versions, isRedeployment, start, end, sleepUntil, running, -1, lastVespaLogTimestamp, - Optional.empty(), Optional.empty(), testerCertificate, dryRun, reason); + lastTesterLogTimestamp, Optional.empty(), Optional.empty(), testerCertificate, dryRun, reason); } public Run with(long lastTestRecord) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); } - public Run with(Instant lastVespaLogTimestamp) { + public Run with(Instant lastVespaLogTimestamp, Instant lastTesterLogTimestamp) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); } public Run noNodesDownSince(Instant noNodesDownSince) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, - Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate, dryRun, reason); + lastTesterLogTimestamp, Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate, dryRun, reason); } public Run withSummary(ConvergenceSummary convergenceSummary) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, Optional.ofNullable(convergenceSummary), testerCertificate, dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, Optional.ofNullable(convergenceSummary), testerCertificate, dryRun, reason); } public Run with(X509Certificate testerCertificate) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, status, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, Optional.of(testerCertificate), dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, Optional.of(testerCertificate), dryRun, reason); } public Run sleepingUntil(Instant instant) { requireActive(); return new Run(id, steps, versions, isRedeployment, start, end, Optional.of(instant), status, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + lastTesterLogTimestamp, noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); } /** Returns the id of this run. */ @@ -237,6 +241,11 @@ public class Run { return lastVespaLogTimestamp; } + /** Returns the timestamp of the last tester Vespa log record fetched and stored for this run. */ + public Instant lastTesterLogTimestamp() { + return lastTesterLogTimestamp; + } + /** Returns since when no nodes have been allowed to be down. */ public Optional<Instant> noNodesDownSince() { return noNodesDownSince; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index 369699eb3a3..e954a1b6849 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -83,7 +83,6 @@ public class JobRunner extends ControllerMaintainer { jobs.abort(run.id(), "job timeout of " + jobTimeout + " reached"); advance(jobs.run(run.id()).get()); }); - else if (run.readySteps().isEmpty()) executors.execute(() -> finish(run.id())); else if (run.hasFailed() || run.sleepUntil().map(sleepUntil -> ! sleepUntil.isAfter(controller().clock().instant())).orElse(true)) diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java index dd28978d948..fa082db2f75 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.controller.persistence; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.SystemName; import com.yahoo.security.X509CertificateUtils; import com.yahoo.slime.ArrayTraverser; import com.yahoo.slime.Cursor; @@ -92,6 +91,7 @@ class RunSerializer { private static final String sourceField = "source"; private static final String lastTestRecordField = "lastTestRecord"; private static final String lastVespaLogTimestampField = "lastVespaLogTimestamp"; + private static final String lastTesterLogTimestampField = "lastTesterLogTimestamp"; private static final String noNodesDownSinceField = "noNodesDownSince"; private static final String convergenceSummaryField = "convergenceSummaryV2"; private static final String testerCertificateField = "testerCertificate"; @@ -138,6 +138,7 @@ class RunSerializer { runStatusOf(runObject.field(statusField).asString()), runObject.field(lastTestRecordField).asLong(), Instant.EPOCH.plus(runObject.field(lastVespaLogTimestampField).asLong(), ChronoUnit.MICROS), + Instant.EPOCH.plus(runObject.field(lastTesterLogTimestampField).asLong(), ChronoUnit.MICROS), SlimeUtils.optionalInstant(runObject.field(noNodesDownSinceField)), convergenceSummaryFrom(runObject.field(convergenceSummaryField)), Optional.of(runObject.field(testerCertificateField)) @@ -214,7 +215,8 @@ class RunSerializer { run.sleepUntil().ifPresent(end -> runObject.setLong(sleepingUntilField, end.toEpochMilli())); runObject.setString(statusField, valueOf(run.status())); runObject.setLong(lastTestRecordField, run.lastTestLogEntry()); - runObject.setLong(lastVespaLogTimestampField, Instant.EPOCH.until(run.lastVespaLogTimestamp(), ChronoUnit.MICROS)); + if (run.lastVespaLogTimestamp().isAfter(Instant.EPOCH)) runObject.setLong(lastVespaLogTimestampField, Instant.EPOCH.until(run.lastVespaLogTimestamp(), ChronoUnit.MICROS)); + if (run.lastTesterLogTimestamp().isAfter(Instant.EPOCH)) runObject.setLong(lastTesterLogTimestampField, Instant.EPOCH.until(run.lastTesterLogTimestamp(), ChronoUnit.MICROS)); run.noNodesDownSince().ifPresent(noNodesDownSince -> runObject.setLong(noNodesDownSinceField, noNodesDownSince.toEpochMilli())); run.convergenceSummary().ifPresent(convergenceSummary -> toSlime(convergenceSummary, runObject.setArray(convergenceSummaryField))); run.testerCertificate().ifPresent(certificate -> runObject.setString(testerCertificateField, X509CertificateUtils.toPem(certificate))); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java index 3ed5854327b..2eaaaf810a9 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java @@ -93,8 +93,7 @@ public class JobRunnerTest { ApplicationId id = appId.defaultInstance(); byte[] testPackageBytes = new byte[0]; jobs.submit(appId, Submission.basic(applicationPackage, testPackageBytes), 2); - - start(jobs, id, systemTest); + start(jobs, id, systemTest); try { start(jobs, id, systemTest); fail("Job is already running, so this should not be allowed!"); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java index fd0ea50e50b..fade713955f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java @@ -25,7 +25,6 @@ import java.time.Instant; import java.util.List; import java.util.Optional; -import static com.yahoo.config.provision.SystemName.main; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running; import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.failed; @@ -119,7 +118,7 @@ public class RunSerializerTest { run.steps()); run = run.with(1L << 50) - .with(Instant.now().truncatedTo(MILLIS)) + .with(Instant.now().truncatedTo(MILLIS), Instant.now().plusSeconds(2).truncatedTo(MILLIS)) .noNodesDownSince(Instant.now().truncatedTo(MILLIS)) .aborted() .finished(Instant.now().truncatedTo(MILLIS)); @@ -132,6 +131,8 @@ public class RunSerializerTest { assertEquals(run.end(), phoenix.end()); assertEquals(run.status(), phoenix.status()); assertEquals(run.lastTestLogEntry(), phoenix.lastTestLogEntry()); + assertEquals(run.lastVespaLogTimestamp(), phoenix.lastVespaLogTimestamp()); + assertEquals(run.lastTesterLogTimestamp(), phoenix.lastTesterLogTimestamp()); assertEquals(run.noNodesDownSince(), phoenix.noNodesDownSince()); assertEquals(run.testerCertificate(), phoenix.testerCertificate()); assertEquals(run.versions(), phoenix.versions()); diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java index 1edf8e4edbe..b038106843e 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java @@ -46,7 +46,7 @@ public abstract class Maintainer implements Runnable { Objects.requireNonNull(startedAt); Objects.requireNonNull(clusterHostnames); Duration initialDelay = staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames) - .plus(Duration.ofSeconds(30)); // Let the system stabilize before maintenance + .plus(Duration.ofSeconds(30)); // Let the system stabilize before maintenance service = new ScheduledThreadPoolExecutor(1, r -> new Thread(r, name() + "-worker")); service.scheduleAtFixedRate(this, initialDelay.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); jobControl.started(name(), this); |