summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorJon Marius Venstad <venstad@gmail.com>2019-11-06 13:46:49 +0100
committerJon Marius Venstad <venstad@gmail.com>2019-11-07 08:41:29 +0100
commite072011cd0850d634aa9ca9e58c6bff9d3301b6e (patch)
treeb5a772154c972601d3cdaec8640d03c87bcc724d /controller-server
parent1c281808c40a1a65338f397fc086d4a3922ed2f5 (diff)
Let JobController provide lastSuccess, firstFailing and lastCompleted
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java46
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java3
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java5
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java69
4 files changed, 108 insertions, 15 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
index 3deca255bad..41266e761a9 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
@@ -38,9 +38,11 @@ import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.NavigableMap;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
+import java.util.TreeMap;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
@@ -215,10 +217,15 @@ public class JobController {
}
/** Returns an immutable map of all known runs for the given application and job type. */
- public Map<RunId, Run> runs(ApplicationId id, JobType type) {
- SortedMap<RunId, Run> runs = curator.readHistoricRuns(id, type);
+ public NavigableMap<RunId, Run> runs(JobId id) {
+ return runs(id.application(), id.type());
+ }
+
+ /** Returns an immutable map of all known runs for the given application and job type. */
+ public NavigableMap<RunId, Run> runs(ApplicationId id, JobType type) {
+ NavigableMap<RunId, Run> runs = curator.readHistoricRuns(id, type);
last(id, type).ifPresent(run -> runs.put(run.id(), run));
- return ImmutableMap.copyOf(runs);
+ return Collections.unmodifiableNavigableMap(runs);
}
/** Returns the run with the given id, if it exists. */
@@ -238,6 +245,31 @@ public class JobController {
return curator.readLastRun(id, type);
}
+ /** Returns the last completed of the given job. */
+ public Optional<Run> lastCompleted(JobId id) {
+ return Optional.ofNullable(curator.readHistoricRuns(id.application(), id.type())
+ .lastEntry().getValue());
+ }
+
+ /** Returns the first failing of the given job. */
+ public Optional<Run> firstFailing(JobId id) {
+ Run failed = null;
+ loop: for (Run run : runs(id).descendingMap().values())
+ switch (run.status()) {
+ case running: continue loop;
+ case success: break loop;
+ default: failed = run;
+ }
+ return Optional.ofNullable(failed);
+ }
+
+ /** Returns the last success of the given job. */
+ public Optional<Run> lastSuccess(JobId id) {
+ return runs(id).descendingMap().values().stream()
+ .filter(run -> run.status() == RunStatus.success)
+ .findFirst();
+ }
+
/** Returns the run with the given id, provided it is still active. */
public Optional<Run> active(RunId id) {
return last(id.application(), id.type())
@@ -274,11 +306,19 @@ public class JobController {
locked(id.application(), id.type(), runs -> {
runs.put(run.id(), finishedRun);
long last = id.number();
+ long successes = runs.values().stream().filter(old -> old.status() == RunStatus.success).count();
var oldEntries = runs.entrySet().iterator();
for (var old = oldEntries.next();
old.getKey().number() <= last - historyLength
|| old.getValue().start().isBefore(controller.clock().instant().minus(maxHistoryAge));
old = oldEntries.next()) {
+
+ // Make sure we keep the last success and the first failing
+ if (successes == 1 && old.getValue().status() == RunStatus.success) {
+ oldEntries.next();
+ continue;
+ }
+
logs.delete(old.getKey());
oldEntries.remove();
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java
index dbd52fc6d02..037fb4da987 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java
@@ -39,6 +39,7 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.NavigableMap;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
@@ -381,7 +382,7 @@ public class CuratorDb {
return readSlime(lastRunPath(id, type)).map(runSerializer::runFromSlime);
}
- public SortedMap<RunId, Run> readHistoricRuns(ApplicationId id, JobType type) {
+ public NavigableMap<RunId, Run> readHistoricRuns(ApplicationId id, JobType type) {
return readSlime(runsPath(id, type)).map(runSerializer::runsFromSlime).orElse(new TreeMap<>(comparing(RunId::number)));
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
index dda1fb881a7..b84df02e583 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
@@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.controller.deployment.Versions;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.EnumMap;
+import java.util.NavigableMap;
import java.util.Optional;
import java.util.SortedMap;
import java.util.TreeMap;
@@ -90,8 +91,8 @@ class RunSerializer {
return runFromSlime(slime.get());
}
- SortedMap<RunId, Run> runsFromSlime(Slime slime) {
- SortedMap<RunId, Run> runs = new TreeMap<>(comparing(RunId::number));
+ NavigableMap<RunId, Run> runsFromSlime(Slime slime) {
+ NavigableMap<RunId, Run> runs = new TreeMap<>(comparing(RunId::number));
Inspector runArray = slime.get();
runArray.traverse((ArrayTraverser) (__, runObject) -> {
Run run = runFromSlime(runObject);
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java
index 0f6aec804e2..d3752fcaff4 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
+import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.SourceRevision;
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
@@ -242,24 +243,74 @@ public class JobRunnerTest {
inThreadExecutor(), (id, step) -> Optional.of(running));
TenantAndApplicationId appId = tester.createApplication("tenant", "real", "default").id();
- ApplicationId id = appId.defaultInstance();
+ ApplicationId instanceId = appId.defaultInstance();
+ JobId jobId = new JobId(instanceId, systemTest);
jobs.submit(appId, versions.targetApplication().source().get(), "a@b", 2, applicationPackage, new byte[0]);
+ assertFalse(jobs.lastSuccess(jobId).isPresent());
for (int i = 0; i < jobs.historyLength(); i++) {
- jobs.start(id, systemTest, versions);
+ jobs.start(instanceId, systemTest, versions);
runner.run();
}
- assertEquals(256, jobs.runs(id, systemTest).size());
- assertTrue(jobs.details(new RunId(id, systemTest, 1)).isPresent());
+ assertEquals(256, jobs.runs(jobId).size());
+ assertTrue(jobs.details(new RunId(instanceId, systemTest, 1)).isPresent());
- jobs.start(id, systemTest, versions);
+ jobs.start(instanceId, systemTest, versions);
runner.run();
- assertEquals(256, jobs.runs(id, systemTest).size());
- assertEquals(2, jobs.runs(id, systemTest).keySet().iterator().next().number());
- assertFalse(jobs.details(new RunId(id, systemTest, 1)).isPresent());
- assertTrue(jobs.details(new RunId(id, systemTest, 257)).isPresent());
+ assertEquals(256, jobs.runs(jobId).size());
+ assertEquals(2, jobs.runs(jobId).keySet().iterator().next().number());
+ assertFalse(jobs.details(new RunId(instanceId, systemTest, 1)).isPresent());
+ assertTrue(jobs.details(new RunId(instanceId, systemTest, 257)).isPresent());
+
+ JobRunner failureRunner = new JobRunner(tester.controller(), Duration.ofDays(1), new JobControl(tester.controller().curator()),
+ inThreadExecutor(), (id, step) -> Optional.of(error));
+
+ // Make all but the oldest of the 256 jobs a failure.
+ for (int i = 0; i < jobs.historyLength() - 1; i++) {
+ jobs.start(instanceId, systemTest, versions);
+ failureRunner.run();
+ }
+ assertEquals(256, jobs.runs(jobId).size());
+ assertEquals(257, jobs.runs(jobId).keySet().iterator().next().number());
+ assertEquals(257, jobs.lastSuccess(jobId).get().id().number());
+ assertEquals(258, jobs.firstFailing(jobId).get().id().number());
+
+ // Oldest success is kept even though it would normally overflow.
+ jobs.start(instanceId, systemTest, versions);
+ failureRunner.run();
+ assertEquals(257, jobs.runs(jobId).size());
+ assertEquals(257, jobs.runs(jobId).keySet().iterator().next().number());
+ assertEquals(257, jobs.lastSuccess(jobId).get().id().number());
+ assertEquals(258, jobs.firstFailing(jobId).get().id().number());
+
+ // First failure after the last success is also kept.
+ jobs.start(instanceId, systemTest, versions);
+ failureRunner.run();
+ assertEquals(258, jobs.runs(jobId).size());
+ assertEquals(257, jobs.runs(jobId).keySet().iterator().next().number());
+ assertEquals(258, jobs.runs(jobId).keySet().stream().skip(1).iterator().next().number());
+ assertEquals(257, jobs.lastSuccess(jobId).get().id().number());
+ assertEquals(258, jobs.firstFailing(jobId).get().id().number());
+
+ // No other jobs are kept with repeated failures.
+ jobs.start(instanceId, systemTest, versions);
+ failureRunner.run();
+ assertEquals(258, jobs.runs(jobId).size());
+ assertEquals(257, jobs.runs(jobId).keySet().iterator().next().number());
+ assertEquals(258, jobs.runs(jobId).keySet().stream().skip(1).iterator().next().number());
+ assertEquals(260, jobs.runs(jobId).keySet().stream().skip(2).iterator().next().number());
+ assertEquals(257, jobs.lastSuccess(jobId).get().id().number());
+ assertEquals(258, jobs.firstFailing(jobId).get().id().number());
+
+ // history length returns to 256 when a new success is recorded.
+ jobs.start(instanceId, systemTest, versions);
+ runner.run();
+ assertEquals(256, jobs.runs(jobId).size());
+ assertEquals(261, jobs.runs(jobId).keySet().iterator().next().number());
+ assertEquals(516, jobs.lastSuccess(jobId).get().id().number());
+ assertFalse(jobs.firstFailing(jobId).isPresent());
}
@Test