summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorJon Marius Venstad <venstad@gmail.com>2022-02-02 11:27:58 +0100
committerJon Marius Venstad <venstad@gmail.com>2022-02-02 11:27:58 +0100
commit91dee3f7000822f39769025798cc3cbdb35843a7 (patch)
tree47b767cfac2380486a51f466d1e1dbf6515e40cb /controller-server
parent34ce6dda4876d24680986451e79c14fa919708c7 (diff)
Special-case OOC only in test zones, and catch all dependent jobs for a revision change
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java49
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java9
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java4
3 files changed, 29 insertions, 33 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
index 88a2d0b0a6b..7f7449fb38b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
@@ -118,7 +118,7 @@ public class DeploymentStatus {
return ! allJobs.matching(job -> criticalJobs.contains(job.id()))
.failing()
- .not().withStatus(RunStatus.outOfCapacity)
+ .not().outOfTestCapacity()
.isEmpty();
}
@@ -126,39 +126,29 @@ public class DeploymentStatus {
if (visited.contains(current))
return dependents.contains(current);
- if (dependency == current) {
+ if (dependency == current)
dependents.add(current);
- visited.add(current);
- return true;
- }
+ else
+ for (StepStatus dep : current.dependencies)
+ if (fillDependents(dependency, visited, dependents, dep))
+ dependents.add(current);
- for (StepStatus dep : current.dependencies)
- if (fillDependents(dependency, visited, dependents, dep))
- dependents.add(current);
visited.add(current);
-
return dependents.contains(current);
}
- /** Whether any jobs of the given instance, or in an instance dependent on it, are failing with other errors than lack of capacity in a test zone. */
- public boolean hasFailures(InstanceName instance) {
- Set<StepStatus> dependents = new HashSet<>();
- for (StepStatus instanceStep : instanceSteps().values())
- fillDependents(instanceSteps().get(instance), new HashSet<>(), dependents, instanceStep);
-
- Set<InstanceName> dependentInstances = dependents.stream()
- .map(StepStatus::instance)
- .collect(Collectors.toSet());
- return ! allJobs.instance(dependentInstances)
- .failing()
- .not().withStatus(RunStatus.outOfCapacity)
+ /** Whether any job is failing on anything older than version, with errors other than lack of capacity in a test zone.. */
+ public boolean hasFailures(ApplicationVersion version) {
+ return ! allJobs.failing()
+ .not().outOfTestCapacity()
+ .matching(job -> job.lastTriggered().get().versions().targetApplication().compareTo(version) < 0)
.isEmpty();
}
/** Whether any jobs of this application are failing with other errors than lack of capacity in a test zone. */
public boolean hasFailures() {
return ! allJobs.failing()
- .not().withStatus(RunStatus.outOfCapacity)
+ .not().outOfTestCapacity()
.isEmpty();
}
@@ -380,12 +370,15 @@ public class DeploymentStatus {
boolean platformReadyFirst = platformReadyAt.get().isBefore(revisionReadyAt.get());
boolean revisionReadyFirst = revisionReadyAt.get().isBefore(platformReadyAt.get());
switch (application.deploymentSpec().requireInstance(job.application().instance()).upgradeRollout()) {
- case separate: // Let whichever change rolled out first, keep rolling first, unless jobs are failing.
- return hasFailures() ? List.of(change)
- : platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH) ? List.of(change.withoutApplication(), change) // No jobs have run yet, assume platform came first
- : revisionReadyFirst ? List.of(change.withoutPlatform(), change)
- : List.of(change);
- case leading: // When one change catches up, they fuse and continue together.
+ case separate: // Let whichever change rolled out first, keep rolling first, unless upgrade alone is failing.
+ return (platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH)) // Assume platform was first if no jobs have run yet.
+ ? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent()
+ ? List.of(change) // Platform was first, but is failing.
+ : List.of(change.withoutApplication(), change) // Platform was first, and is OK.
+ : revisionReadyFirst
+ ? List.of(change.withoutPlatform(), change) // Revision was first.
+ : List.of(change); // Both ready at the same time, probably due to earlier failure.
+ case leading: // When one change catches up, they fuse and continue together.
return List.of(change);
case simultaneous: // Revisions are allowed to run ahead, but the job where it caught up should have both changes.
return platformReadyFirst ? List.of(change) : List.of(change.withoutPlatform(), change);
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 6da06acb6e5..b944593bbab 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -97,7 +97,7 @@ public class DeploymentTrigger {
&& status.instanceSteps().get(instanceName)
.readyAt(outstanding)
.map(readyAt -> ! readyAt.isAfter(clock.instant())).orElse(false)
- && acceptNewApplicationVersion(status, instanceName)) {
+ && acceptNewApplicationVersion(status, instanceName, outstanding.application().get())) {
application = application.with(instanceName,
instance -> withRemainingChange(instance, instance.change().with(outstanding.application().get()), status));
}
@@ -386,13 +386,12 @@ public class DeploymentTrigger {
// ---------- Change management o_O ----------
- private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance) {
+ private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance, ApplicationVersion version) {
if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance.
- if (status.hasFailures(instance)) return true; // Allow changes to fix upgrade or previous revision problems.
+ if (status.hasFailures(version)) return true; // Allow changes to fix upgrade or previous revision problems.
DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance);
Change change = status.application().require(instance).change();
- if (change.application().isPresent() && spec.upgradeRevision() == DeploymentSpec.UpgradeRevision.separate) return false;
- return true;
+ return change.application().isEmpty() || spec.upgradeRevision() != DeploymentSpec.UpgradeRevision.separate;
}
private Instance withRemainingChange(Instance instance, Change change, DeploymentStatus status) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
index ab6816d3646..fe9c6e6f3d6 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
@@ -52,6 +52,10 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> {
return matching(job -> job.lastCompleted().isPresent() && ! job.isSuccess());
}
+ public JobList outOfTestCapacity() {
+ return matching(job -> job.isOutOfCapacity() && job.id().type().environment().isTest());
+ }
+
public JobList running() {
return matching(job -> job.isRunning());
}