diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2019-11-11 10:04:08 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-11-11 10:04:08 +0100 |
commit | fd61d661ac5f28263a48ac6051c17c205f8e101b (patch) | |
tree | 8053336f34d84470c2cbc322bda7ac6709289fea /controller-server | |
parent | c6f92e9137324e7caca0dca219c0db95645acbc0 (diff) | |
parent | 357e91af802db4697436cb22a9ff525476ba7d79 (diff) |
Merge pull request #11252 from vespa-engine/jvenstad/instance-orchestration
Consider all known runs when checking whether versions are verified
Diffstat (limited to 'controller-server')
15 files changed, 291 insertions, 162 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java index d28df826957..21a1b3c1eda 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java @@ -2,7 +2,6 @@ package com.yahoo.vespa.hosted.controller; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentInstanceSpec; import com.yahoo.config.application.api.DeploymentSpec; @@ -57,13 +56,13 @@ import com.yahoo.vespa.hosted.controller.application.Endpoint; import com.yahoo.vespa.hosted.controller.application.EndpointId; import com.yahoo.vespa.hosted.controller.application.JobList; import com.yahoo.vespa.hosted.controller.application.JobStatus; -import com.yahoo.vespa.hosted.controller.application.JobStatus.JobRun; import com.yahoo.vespa.hosted.controller.application.SystemApplication; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; import com.yahoo.vespa.hosted.controller.athenz.impl.AthenzFacade; import com.yahoo.vespa.hosted.controller.concurrent.Once; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger; import com.yahoo.vespa.hosted.controller.deployment.Run; +import com.yahoo.vespa.hosted.controller.deployment.Versions; import com.yahoo.vespa.hosted.controller.dns.NameServiceQueue.Priority; import com.yahoo.vespa.hosted.controller.maintenance.RoutingPolicies; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; @@ -75,7 +74,6 @@ import com.yahoo.vespa.hosted.controller.tenant.AthenzTenant; import com.yahoo.vespa.hosted.controller.tenant.Tenant; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import com.yahoo.vespa.hosted.rotation.config.RotationsConfig; -import com.yahoo.yolean.Exceptions; import java.net.URI; import java.security.Principal; @@ -379,16 +377,14 @@ public class ApplicationController { else { JobType jobType = JobType.from(controller.system(), zone) .orElseThrow(() -> new IllegalArgumentException("No job is known for " + zone + ".")); - Optional<JobStatus> job = Optional.ofNullable(application.get().require(instance).deploymentJobs().jobStatus().get(jobType)); - if ( job.isEmpty() - || job.get().lastTriggered().isEmpty() - || job.get().lastCompleted().isPresent() && job.get().lastCompleted().get().at().isAfter(job.get().lastTriggered().get().at())) + var run = controller.jobController().last(instanceId, jobType); + if (run.map(Run::hasEnded).orElse(true)) return unexpectedDeployment(instanceId, zone); - JobRun triggered = job.get().lastTriggered().get(); - platformVersion = preferOldestVersion ? triggered.sourcePlatform().orElse(triggered.platform()) - : triggered.platform(); - applicationVersion = preferOldestVersion ? triggered.sourceApplication().orElse(triggered.application()) - : triggered.application(); + Versions versions = run.get().versions(); + platformVersion = preferOldestVersion ? versions.sourcePlatform().orElse(versions.targetPlatform()) + : versions.targetPlatform(); + applicationVersion = preferOldestVersion ? versions.sourceApplication().orElse(versions.targetApplication()) + : versions.targetApplication(); applicationPackage = getApplicationPackage(instanceId, applicationVersion); applicationPackage = withTesterCertificate(applicationPackage, instanceId, jobType); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index 3fe8e9f52c3..99f5572ad60 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -92,7 +92,7 @@ public class ApplicationList extends AbstractFilteringList<Application, Applicat /** Returns the subset of applications which currently have failing jobs */ public ApplicationList failing() { return matching(application -> application.instances().values().stream() - .anyMatch(instance -> instance.deploymentJobs().hasFailures())); + .anyMatch(instance -> instance.deploymentJobs().hasFailures())); } /** Returns the subset of applications which have been failing an upgrade to the given version since the given instant */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index 1582bc144f4..092677255f2 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -1,5 +1,6 @@ package com.yahoo.vespa.hosted.controller.deployment; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.InstanceName; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId; @@ -9,6 +10,10 @@ import java.util.Map; import java.util.Objects; import java.util.stream.Collectors; +import static java.util.stream.Collectors.collectingAndThen; +import static java.util.stream.Collectors.groupingBy; +import static java.util.stream.Collectors.toUnmodifiableList; + /** * Status of the deployment jobs of an {@link Application}. * @@ -17,33 +22,38 @@ import java.util.stream.Collectors; public class DeploymentStatus { private final Application application; - private final Map<JobId, JobStatus> jobs; + private final JobList jobs; public DeploymentStatus(Application application, Map<JobId, JobStatus> jobs) { this.application = Objects.requireNonNull(application); - this.jobs = Map.copyOf(jobs); + this.jobs = JobList.from(jobs.values()); } public Application application() { return application; } - public Map<JobId, JobStatus> jobs() { + public JobList jobs() { return jobs; } public boolean hasFailures() { - return ! JobList.from(jobs.values()) - .failing() - .not().withStatus(RunStatus.outOfCapacity) - .isEmpty(); + return ! jobs.failing() + .not().withStatus(RunStatus.outOfCapacity) + .isEmpty(); } public Map<JobType, JobStatus> instanceJobs(InstanceName instance) { - return jobs.entrySet().stream() - .filter(entry -> entry.getKey().application().equals(application.id().instance(instance))) - .collect(Collectors.toUnmodifiableMap(entry -> entry.getKey().type(), - entry -> entry.getValue())); + return jobs.asList().stream() + .filter(job -> job.id().application().equals(application.id().instance(instance))) + .collect(Collectors.toUnmodifiableMap(job -> job.id().type(), + job -> job)); + } + + public Map<ApplicationId, JobList> instanceJobs() { + return jobs.asList().stream() + .collect(groupingBy(job -> job.id().application(), + collectingAndThen(toUnmodifiableList(), JobList::from))); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java new file mode 100644 index 00000000000..779beab60d1 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java @@ -0,0 +1,82 @@ +package com.yahoo.vespa.hosted.controller.deployment; + +import com.yahoo.collections.AbstractFilteringList; +import com.yahoo.component.Version; +import com.yahoo.vespa.hosted.controller.Instance; +import com.yahoo.vespa.hosted.controller.application.ApplicationList; + +import java.time.Instant; +import java.util.Collection; + +/** + * List for filtering deployment status of applications, for inspection and decision making. + * + * @author jonmv + */ +public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus, DeploymentStatusList> { + + private DeploymentStatusList(Collection<? extends DeploymentStatus> items, boolean negate) { + super(items, negate, DeploymentStatusList::new); + } + + public static DeploymentStatusList from(Collection<? extends DeploymentStatus> status) { + return new DeploymentStatusList(status, false); + } + + public ApplicationList asApplicationList() { + return ApplicationList.from(mapToList(DeploymentStatus::application)); + } + + public DeploymentStatusList failing() { + return matching(DeploymentStatus::hasFailures); + } + + public DeploymentStatusList failingUpgrade() { + return matching(status -> status.instanceJobs().values().stream() + .anyMatch(jobs -> ! jobs.failing().not().failingApplicationChange().isEmpty())); + } + + /** Returns the subset of applications which have been failing an upgrade to the given version since the given instant */ + public DeploymentStatusList failingUpgradeToVersionSince(Version version, Instant threshold) { + return matching(status -> status.instanceJobs().values().stream() + .anyMatch(jobs -> failingUpgradeToVersionSince(jobs, version, threshold))); + } + + /** Returns the subset of applications which have been failing an application change since the given instant */ + public DeploymentStatusList failingApplicationChangeSince(Instant threshold) { + return matching(status -> status.instanceJobs().values().stream() + .anyMatch(jobs -> failingApplicationChangeSince(jobs, threshold))); + } + + /** Returns the subset of applications which currently have failing jobs on the given version */ + public DeploymentStatusList failingOn(Version version) { + return matching(status -> status.instanceJobs().values().stream() + .anyMatch(jobs -> failingOn(jobs, version))); + } + + /** Returns the subset of applications which started failing on the given version */ + public DeploymentStatusList startedFailingOn(Version version) { + return matching(status -> status.instanceJobs().values().stream() + .anyMatch(jobs -> ! jobs.firstFailing().on(version).isEmpty())); + } + + private static boolean failingOn(JobList jobs, Version version) { + return ! jobs.failing() + .lastCompleted().on(version) + .isEmpty(); + } + + private static boolean failingUpgradeToVersionSince(JobList jobs, Version version, Instant threshold) { + return ! jobs.not().failingApplicationChange() + .firstFailing().endedBefore(threshold) + .lastCompleted().on(version) + .isEmpty(); + } + + private static boolean failingApplicationChangeSince(JobList jobs, Instant threshold) { + return jobs.failingApplicationChange() + .firstFailing().endedBefore(threshold) + .isEmpty(); + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index f1b93c7b3b2..90d4b231ec6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -205,8 +205,9 @@ public class DeploymentTrigger { controller.systemVersion()); String reason = "Job triggered manually by " + user; var jobStatus = jobs.deploymentStatus(application).instanceJobs(instance.name()); - return (jobType.isProduction() && ! isTested(jobStatus, versions) - ? testJobs(application.deploymentSpec(), application.change(), instance, jobStatus, versions, reason, clock.instant(), __ -> true).stream() + var jobList = JobList.from(jobStatus.values()); + return (jobType.isProduction() && ! isTested(jobList, versions) + ? testJobs(application.deploymentSpec(), application.change(), instance, jobList, versions, reason, clock.instant(), __ -> true).stream() : Stream.of(deploymentJob(instance, versions, application.change(), jobType, jobStatus.get(jobType), reason, clock.instant()))) .peek(this::trigger) .map(Job::jobType).collect(toList()); @@ -301,10 +302,11 @@ public class DeploymentTrigger { DeploymentStatus deploymentStatus = this.jobs.deploymentStatus(application); for (Instance instance : instances) { var jobStatus = deploymentStatus.instanceJobs(instance.name()); + var jobList = JobList.from(jobStatus.values()); Change change = application.change(); - Optional<Instant> completedAt = max(Optional.ofNullable(jobStatus.get(systemTest)) + Optional<Instant> completedAt = max(jobList.type(systemTest).first() .<Instant>flatMap(job -> job.lastSuccess().map(run -> run.end().get())), - Optional.ofNullable(jobStatus.get(stagingTest)) + jobList.type(stagingTest).first() .<Instant>flatMap(job -> job.lastSuccess().map(run -> run.end().get()))); String reason = "New change available"; List<Job> testJobs = null; // null means "uninitialised", while empty means "don't run any jobs". @@ -318,17 +320,14 @@ public class DeploymentTrigger { for (JobType job : remainingJobs) { Versions versions = Versions.from(change, application, deploymentFor(instance, job), controller.systemVersion()); - if (isTested(jobStatus, versions)) { - if (completedAt.isPresent() && canTrigger(job, jobStatus, versions, instance, application.deploymentSpec(), stepJobs)) { + if (isTested(jobList, versions)) { + if (completedAt.isPresent() && canTrigger(job, jobList, versions, instance, application.deploymentSpec(), stepJobs)) { jobs.add(deploymentJob(instance, versions, change, job, jobStatus.get(job), reason, completedAt.get())); } - if ( ! alreadyTriggered(jobStatus, versions) && testJobs == null) { - testJobs = emptyList(); - } } else if (testJobs == null) { testJobs = testJobs(application.deploymentSpec(), - change, instance, jobStatus, versions, + change, instance, jobList, versions, String.format("Testing deployment for %s (%s)", job.jobName(), versions.toString()), completedAt.orElseGet(clock::instant)); @@ -349,7 +348,7 @@ public class DeploymentTrigger { } } if (testJobs == null) { // If nothing to test, but outstanding commits, test those. - testJobs = testJobs(application.deploymentSpec(), change, instance, jobStatus, + testJobs = testJobs(application.deploymentSpec(), change, instance, jobList, Versions.from(application.outstandingChange().onTopOf(change), application, steps.sortedDeployments(instance.productionDeployments().values()).stream().findFirst(), @@ -363,21 +362,22 @@ public class DeploymentTrigger { } /** Returns whether given job should be triggered */ - private boolean canTrigger(JobType job, Map<JobType, JobStatus> status, Versions versions, Instance instance, DeploymentSpec deploymentSpec, List<JobType> parallelJobs) { - if (status.get(job).isRunning()) return false; + private boolean canTrigger(JobType type, JobList jobList, Versions versions, Instance instance, DeploymentSpec deploymentSpec, List<JobType> parallelJobs) { + if ( ! jobList.type(type).running().isEmpty()) return false; // Are we already running jobs which are not in the set which can run in parallel with this? - if (parallelJobs != null && ! parallelJobs.containsAll(runningProductionJobs(status))) return false; + if ( parallelJobs != null + && ! parallelJobs.containsAll(jobList.running().production().mapToList(job -> job.id().type()))) return false; // Are there another suspended deployment such that we shouldn't simultaneously change this? - if (job.isProduction() && isSuspendedInAnotherZone(instance, job.zone(controller.system()))) return false; + if (type.isProduction() && isSuspendedInAnotherZone(instance, type.zone(controller.system()))) return false; - return triggerAt(clock.instant(), job, status.get(job), versions, instance, deploymentSpec); + return triggerAt(clock.instant(), type, jobList.type(type).first().get(), versions, instance, deploymentSpec); } /** Returns whether given job should be triggered */ - private boolean canTrigger(JobType job, Map<JobType, JobStatus> status, Versions versions, Instance instance, DeploymentSpec deploymentSpec) { - return canTrigger(job, status, versions, instance, deploymentSpec, null); + private boolean canTrigger(JobType job, JobList jobList, Versions versions, Instance instance, DeploymentSpec deploymentSpec) { + return canTrigger(job, jobList, versions, instance, deploymentSpec, null); } private boolean isSuspendedInAnotherZone(Instance instance, ZoneId zone) { @@ -466,28 +466,10 @@ public class DeploymentTrigger { return change.downgrades(deployment.version()) || change.downgrades(deployment.applicationVersion()); } - private boolean isTested(Map<JobType, JobStatus> status, Versions versions) { - return testedIn(systemTest, status.get(systemTest), versions) - && testedIn(stagingTest, status.get(stagingTest), versions) - || alreadyTriggered(status, versions); - } - - public boolean testedIn(JobType testType, JobStatus status, Versions versions) { - if (testType == systemTest) - return successOn(status, versions).isPresent(); - if (testType == stagingTest) - return successOn(status, versions).map(Run::versions).filter(versions::sourcesMatchIfPresent).isPresent(); - throw new IllegalArgumentException(testType + " is not a test job!"); - } - - public boolean alreadyTriggered(Map<JobType, JobStatus> status, Versions versions) { - return status.values().stream() - .filter(job -> job.id().type().isProduction()) - .anyMatch(job -> job.lastTriggered() - .map(Run::versions) - .filter(versions::targetsMatch) - .filter(versions::sourcesMatchIfPresent) - .isPresent()); + public boolean isTested(JobList jobs, Versions versions) { + return ! jobs.type(systemTest).successOn(versions).isEmpty() + && ! jobs.type(stagingTest).successOn(versions).isEmpty() + || ! jobs.production().triggeredOn(versions).isEmpty(); } // ---------- Change management o_O ---------- @@ -496,8 +478,7 @@ public class DeploymentTrigger { if ( ! application.deploymentSpec().instances().stream() .allMatch(instance -> instance.canChangeRevisionAt(clock.instant()))) return false; if (application.change().application().isPresent()) return true; // Replacing a previous application change is ok. - for (Instance instance : application.instances().values()) - if (instance.deploymentJobs().hasFailures()) return true; // Allow changes to fix upgrade problems. + if (jobs.deploymentStatus(application).hasFailures()) return true; // Allow changes to fix upgrade problems. return application.change().platform().isEmpty(); } @@ -527,23 +508,22 @@ public class DeploymentTrigger { /** * Returns the list of test jobs that should run now, and that need to succeed on the given versions for it to be considered tested. */ - private List<Job> testJobs(DeploymentSpec deploymentSpec, Change change, Instance instance, Map<JobType, JobStatus> status, Versions versions, + private List<Job> testJobs(DeploymentSpec deploymentSpec, Change change, Instance instance, JobList jobList, Versions versions, String reason, Instant availableSince) { - return testJobs(deploymentSpec, change, instance, status, versions, reason, availableSince, - jobType -> canTrigger(jobType, status, versions, instance, deploymentSpec)); + return testJobs(deploymentSpec, change, instance, jobList, versions, reason, availableSince, + jobType -> canTrigger(jobType, jobList, versions, instance, deploymentSpec)); } /** * Returns the list of test jobs that need to succeed on the given versions for it to be considered tested, filtered by the given condition. */ - private List<Job> testJobs(DeploymentSpec deploymentSpec, Change change, Instance instance, Map<JobType, JobStatus> status, Versions versions, + private List<Job> testJobs(DeploymentSpec deploymentSpec, Change change, Instance instance, JobList jobList, Versions versions, String reason, Instant availableSince, Predicate<JobType> condition) { List<Job> jobs = new ArrayList<>(); for (JobType jobType : new DeploymentSteps(deploymentSpec.requireInstance(instance.name()), controller::system).testJobs()) { // TODO jonmv: Allow cross-instance validation - Optional<Run> completion = successOn(status.get(jobType), versions) - .filter(run -> versions.sourcesMatchIfPresent(run.versions()) || jobType == systemTest); - if (completion.isEmpty() && condition.test(jobType)) - jobs.add(deploymentJob(instance, versions, change, jobType, status.get(jobType), reason, availableSince)); + if ( jobList.type(jobType).successOn(versions).isEmpty() + && condition.test(jobType)) + jobs.add(deploymentJob(instance, versions, change, jobType, jobList.type(jobType).first().get(), reason, availableSince)); } return jobs; } @@ -552,8 +532,8 @@ public class DeploymentTrigger { if (jobStatus.isOutOfCapacity()) reason += "; retrying on out of capacity"; var triggering = JobRun.triggering(versions.targetPlatform(), versions.targetApplication(), - versions.sourcePlatform(), versions.sourceApplication(), - reason, clock.instant()); + versions.sourcePlatform(), versions.sourceApplication(), + reason, clock.instant()); return new Job(instance, triggering, jobType, availableSince, jobStatus.isOutOfCapacity(), change.application().isPresent()); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java index e6c59b464a6..84fc7da3fec 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java @@ -20,6 +20,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.SourceRevision; import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud; import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterId; +import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; @@ -32,6 +33,7 @@ import java.security.cert.X509Certificate; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; @@ -298,6 +300,13 @@ public class JobController { status -> status))); } + /** Adds deployment status to each of the given applications. */ + public DeploymentStatusList deploymentStatuses(ApplicationList applications) { + return DeploymentStatusList.from(applications.asList().stream() + .map(this::deploymentStatus) + .collect(toUnmodifiableList())); + } + /** Changes the status of the given step, for the given run, provided it is still active. */ public void update(RunId id, RunStatus status, LockedStep step) { locked(id, run -> run.with(status, step)); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index 1ef83153bef..f771e9314eb 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -9,9 +9,11 @@ import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import java.time.Instant; import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.function.Function; import java.util.function.Predicate; +import java.util.stream.Collectors; /** * A list of deployment jobs that can be filtered in various ways. @@ -41,7 +43,11 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { /** Returns the subset of jobs which are currently failing */ public JobList failing() { - return matching(job -> ! job.isSuccess()); + return matching(job -> job.lastCompleted().isPresent() && ! job.isSuccess()); + } + + public JobList running() { + return matching(job -> job.isRunning()); } /** Returns the subset of jobs which must be failing due to an application change */ @@ -55,8 +61,13 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { } /** Returns the subset of jobs of the given type -- most useful when negated */ + public JobList type(Collection<? extends JobType> types) { + return matching(job -> types.contains(job.id().type())); + } + + /** Returns the subset of jobs of the given type -- most useful when negated */ public JobList type(JobType... types) { - return matching(job -> List.of(types).contains(job.id().type())); + return type(List.of(types)); } /** Returns the subset of jobs of which are production jobs */ @@ -64,6 +75,16 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { return matching(job -> job.id().type().isProduction()); } + /** Returns the jobs with any runs matching the given versions — targets only for system test, everything present otherwise. */ + public JobList triggeredOn(Versions versions) { + return matching(job -> ! RunList.from(job).on(versions).isEmpty()); + } + + /** Returns the jobs with successful runs matching the given versions — targets only for system test, everything present otherwise. */ + public JobList successOn(Versions versions) { + return matching(job -> ! RunList.from(job).status(RunStatus.success).on(versions).isEmpty()); + } + // ----------------------------------- JobRun filtering /** Returns the list in a state where the next filter is for the lastTriggered run type */ @@ -86,7 +107,6 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { return new RunFilter(JobStatus::firstFailing); } - /** Allows sub-filters for runs of the given kind */ public class RunFilter { @@ -101,6 +121,21 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { return matching(run -> true); } + /** Returns the runs of the given kind, mapped by the given function, as a list. */ + public <OtherType> List<OtherType> mapToList(Function<? super Run, OtherType> mapper) { + return present().mapToList(which.andThen(Optional::get).andThen(mapper)); + } + + /** Returns the subset of jobs where the run of the given type occurred before the given instant */ + public JobList endedBefore(Instant threshold) { + return matching(run -> run.end().orElse(Instant.MAX).isBefore(threshold)); + } + + /** Returns the subset of jobs where the run of the given type occurred after the given instant */ + public JobList endedAfter(Instant threshold) { + return matching(run -> run.end().orElse(Instant.MIN).isAfter(threshold)); + } + /** Returns the subset of jobs where the run of the given type occurred before the given instant */ public JobList startedBefore(Instant threshold) { return matching(run -> run.start().isBefore(threshold)); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunList.java new file mode 100644 index 00000000000..dc17e1c17bb --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunList.java @@ -0,0 +1,43 @@ +package com.yahoo.vespa.hosted.controller.deployment; + +import com.yahoo.collections.AbstractFilteringList; +import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; + +import java.util.Collection; +import java.util.List; + +/** + * List for filtering deployment job {@link Run}s. + * + * @author jonmv + */ +public class RunList extends AbstractFilteringList<Run, RunList> { + + private RunList(Collection<? extends Run> items, boolean negate) { + super(items, negate, RunList::new); + } + + public static RunList from(Collection<? extends Run> runs) { + return new RunList(runs, false); + } + + public static RunList from(JobStatus job) { + return from(job.runs().descendingMap().values()); + } + + /** Returns the jobs with runs matching the given versions — targets only for system test, everything present otherwise. */ + public RunList on(Versions versions) { + return matching(run -> matchingVersions(run, versions)); + } + + /** Returns the runs with status among the given. */ + public RunList status(RunStatus... status) { + return matching(run -> List.of(status).contains(run.status())); + } + + private static boolean matchingVersions(Run run, Versions versions) { + return versions.targetsMatch(run.versions()) + && (versions.sourcesMatchIfPresent(run.versions()) || run.id().type() == JobType.systemTest); + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 568366d7f2a..ccb802d314a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -10,8 +10,8 @@ import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationV import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics; -import com.yahoo.vespa.hosted.controller.application.JobList; -import com.yahoo.vespa.hosted.controller.application.JobStatus; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatusList; +import com.yahoo.vespa.hosted.controller.deployment.JobList; import com.yahoo.vespa.hosted.controller.rotation.RotationLock; import com.yahoo.vespa.hosted.controller.versions.NodeVersions; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; @@ -71,28 +71,25 @@ public class MetricsReporter extends Maintainer { } private void reportDeploymentMetrics() { - List<Application> applications = ApplicationList.from(controller().applications().asList()) - .withProductionDeployment().asList().stream() - .collect(Collectors.toUnmodifiableList()); - List<Instance> instances = applications.stream() - .flatMap(application -> application.instances().values().stream()) - .collect(Collectors.toUnmodifiableList()); + ApplicationList applications = ApplicationList.from(controller().applications().asList()) + .withProductionDeployment(); + DeploymentStatusList deployments = controller().jobController().deploymentStatuses(applications); - metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(instances) * 100, metric.createContext(Map.of())); + metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(deployments) * 100, metric.createContext(Map.of())); - averageDeploymentDurations(instances, clock.instant()).forEach((application, duration) -> { - metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.getSeconds(), metric.createContext(dimensions(application))); + averageDeploymentDurations(deployments, clock.instant()).forEach((instance, duration) -> { + metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.getSeconds(), metric.createContext(dimensions(instance))); }); - deploymentsFailingUpgrade(instances).forEach((application, failingJobs) -> { - metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(application))); + deploymentsFailingUpgrade(deployments).forEach((instance, failingJobs) -> { + metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(instance))); }); - deploymentWarnings(instances).forEach((application, warnings) -> { + deploymentWarnings(deployments).forEach((application, warnings) -> { metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(application))); }); - for (Application application : applications) + for (Application application : applications.asList()) application.latestVersion() .flatMap(ApplicationVersion::buildTime) .ifPresent(buildTime -> metric.set(DEPLOYMENT_BUILD_AGE_SECONDS, @@ -142,47 +139,42 @@ public class MetricsReporter extends Maintainer { return nodesFailingUpgrade; } - private static double deploymentFailRatio(List<Instance> instances) { - return instances.stream() - .mapToInt(instance -> instance.deploymentJobs().hasFailures() ? 1 : 0) - .average().orElse(0); + private static double deploymentFailRatio(DeploymentStatusList statuses) { + return statuses.asList().stream() + .mapToInt(status -> status.hasFailures() ? 1 : 0) + .average().orElse(0); } - private static Map<ApplicationId, Duration> averageDeploymentDurations(List<Instance> instances, Instant now) { - return instances.stream() - .collect(Collectors.toMap(Instance::id, - instance -> averageDeploymentDuration(instance, now))); + private static Map<ApplicationId, Duration> averageDeploymentDurations(DeploymentStatusList statuses, Instant now) { + return statuses.asList().stream() + .flatMap(status -> status.instanceJobs().entrySet().stream()) + .collect(Collectors.toUnmodifiableMap(entry -> entry.getKey(), + entry -> averageDeploymentDuration(entry.getValue(), now))); } - private static Map<ApplicationId, Integer> deploymentsFailingUpgrade(List<Instance> instances) { - return instances.stream() - .collect(Collectors.toMap(Instance::id, MetricsReporter::deploymentsFailingUpgrade)); + private static Map<ApplicationId, Integer> deploymentsFailingUpgrade(DeploymentStatusList statuses) { + return statuses.asList().stream() + .flatMap(status -> status.instanceJobs().entrySet().stream()) + .collect(Collectors.toUnmodifiableMap(entry -> entry.getKey(), + entry -> deploymentsFailingUpgrade(entry.getValue()))); } - private static int deploymentsFailingUpgrade(Instance instance) { - return JobList.from(instance).upgrading().failing().size(); + private static int deploymentsFailingUpgrade(JobList jobs) { + return jobs.failing().not().failingApplicationChange().size(); } - private static Duration averageDeploymentDuration(Instance instance, Instant now) { - List<Duration> jobDurations = instance.deploymentJobs().jobStatus().values().stream() - .filter(status -> status.lastTriggered().isPresent()) - .map(status -> { - Instant triggeredAt = status.lastTriggered().get().at(); - Instant runningUntil = status.lastCompleted() - .map(JobStatus.JobRun::at) - .filter(at -> at.isAfter(triggeredAt)) - .orElse(now); - return Duration.between(triggeredAt, runningUntil); - }) - .collect(Collectors.toList()); + private static Duration averageDeploymentDuration(JobList jobs, Instant now) { + List<Duration> jobDurations = jobs.lastTriggered() + .mapToList(run -> Duration.between(run.start(), run.end().orElse(now))); return jobDurations.stream() .reduce(Duration::plus) .map(totalDuration -> totalDuration.dividedBy(jobDurations.size())) .orElse(Duration.ZERO); } - private static Map<ApplicationId, Integer> deploymentWarnings(List<Instance> instances) { - return instances.stream() + private static Map<ApplicationId, Integer> deploymentWarnings(DeploymentStatusList statuses) { + return statuses.asList().stream() + .flatMap(status -> status.application().instances().values().stream()) .collect(Collectors.toMap(Instance::id, a -> maxWarningCountOf(a.deployments().values()))); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java index 9a9a9798c6d..ed9612a7343 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java @@ -25,8 +25,10 @@ import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; import com.yahoo.vespa.hosted.controller.deployment.DeploymentSteps; import com.yahoo.vespa.hosted.controller.deployment.JobController; +import com.yahoo.vespa.hosted.controller.deployment.JobList; import com.yahoo.vespa.hosted.controller.deployment.JobStatus; import com.yahoo.vespa.hosted.controller.deployment.Run; +import com.yahoo.vespa.hosted.controller.deployment.RunList; import com.yahoo.vespa.hosted.controller.deployment.RunLog; import com.yahoo.vespa.hosted.controller.deployment.RunStatus; import com.yahoo.vespa.hosted.controller.deployment.Step; @@ -239,11 +241,12 @@ class JobControllerApiHandlerHelper { jobObject.setLong("pausedUntil", until))); int runs = 0; Cursor runArray = jobObject.setArray("runs"); + JobList jobList = JobList.from(status.values()); if (type.isTest()) { Deque<List<JobType>> pending = new ArrayDeque<>(); pendingProduction.entrySet().stream() - .filter(typeVersions -> ! controller.applications().deploymentTrigger().testedIn(type, status.get(type), typeVersions.getValue())) - .filter(typeVersions -> ! controller.applications().deploymentTrigger().alreadyTriggered(status, typeVersions.getValue())) + .filter(typeVersions -> jobList.type(type).successOn(typeVersions.getValue()).isEmpty()) + .filter(typeVersions -> jobList.production().triggeredOn(typeVersions.getValue()).isEmpty()) .collect(groupingBy(Map.Entry::getValue, LinkedHashMap::new, Collectors.mapping(Map.Entry::getKey, toList()))) @@ -279,12 +282,13 @@ class JobControllerApiHandlerHelper { pendingObject.setString("cooldown", "failed"); else { int pending = 0; - if ( ! controller.applications().deploymentTrigger().alreadyTriggered(status, versions)) { - if ( ! controller.applications().deploymentTrigger().testedIn(systemTest, status.get(systemTest), versions)) { + controller.applications().deploymentTrigger(); + if (jobList.production().triggeredOn(versions).isEmpty()) { + if (jobList.type(systemTest).successOn(versions).isEmpty()) { pending++; pendingObject.setString(shortNameOf(systemTest, controller.system()), statusOf(controller, instance.id(), systemTest, versions)); } - if ( ! controller.applications().deploymentTrigger().testedIn(stagingTest, status.get(stagingTest), versions)) { + if (jobList.type(stagingTest).successOn(versions).isEmpty()) { pending++; pendingObject.setString(shortNameOf(stagingTest, controller.system()), statusOf(controller, instance.id(), stagingTest, versions)); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java index d181ab8d38f..6016eed4704 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java @@ -337,7 +337,7 @@ public class DeploymentContext { } /** Simulate convergence time out in given job */ - public void timeOutConvergence(JobType type) { + public DeploymentContext timeOutConvergence(JobType type) { var job = jobId(type); triggerJobs(); RunId id = currentRun(job).id(); @@ -348,6 +348,7 @@ public class DeploymentContext { assertTrue(jobs.run(id).get().hasFailed()); assertTrue(jobs.run(id).get().hasEnded()); doTeardown(job); + return this; } /** Sets a single endpoint in the routing layer for the instance in this */ diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 8fb766164c2..6b0f74fd8bf 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -504,7 +504,7 @@ public class DeploymentTriggerTest { Version version2 = Version.fromString("7.2"); tester.controllerTester().upgradeSystem(version2); tester.upgrader().maintain(); - app1.runJob(systemTest).runJob(stagingTest) // tests for previous version. + app1.runJob(systemTest).runJob(stagingTest) // tests for previous version — these are "reused" later. .runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1); assertEquals(version2, app1.deployment(productionUsCentral1.zone(main)).version()); Instant triggered = app1.instance().deploymentJobs().jobStatus().get(productionUsCentral1).lastTriggered().get().at(); @@ -517,18 +517,12 @@ public class DeploymentTriggerTest { assertEquals("Change becomes latest non-broken version", Change.of(version1), app1.application().change()); // version1 proceeds 'til the last job, where it fails; us-central-1 is skipped, as current change is strictly dominated by what's deployed there. - app1.runJob(systemTest) - .runJob(stagingTest) - .failDeployment(productionEuWest1); + app1.failDeployment(productionEuWest1); assertEquals(triggered, app1.instance().deploymentJobs().jobStatus().get(productionUsCentral1).lastTriggered().get().at()); - // Eagerly triggered system and staging tests complete. - app1.runJob(systemTest).runJob(stagingTest); - // Roll out a new application version, which gives a dual change -- this should trigger us-central-1, but only as long as it hasn't yet deployed there. ApplicationVersion revision1 = app1.lastSubmission().get(); app1.submit(applicationPackage); - app1.jobAborted(productionEuWest1); ApplicationVersion revision2 = app1.lastSubmission().get(); app1.runJob(systemTest).runJob(stagingTest); assertEquals(Change.of(version1).with(revision2), app1.application().change()); @@ -583,21 +577,18 @@ public class DeploymentTriggerTest { assertEquals(firstTested, app.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); app.runJob(systemTest).runJob(stagingTest); - tester.readyJobsTrigger().maintain(); // Run only once, to trigger only the production jobs. - - // Tests are not re-triggered, because the deployments that were tested have not yet been triggered on the tested versions. - assertEquals(firstTested, app.instance().deploymentJobs().jobStatus().get(systemTest).lastTriggered().get().platform()); - assertEquals(firstTested, app.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); - // Finish old run of the aborted production job. - app.jobAborted(productionUsEast3); + // Test jobs for next production zone can start and run immediately. tester.triggerJobs(); - - // New upgrade is already tested for one of the jobs, which has now been triggered, and tests may run for the other job. assertNotEquals(firstTested, app.instance().deploymentJobs().jobStatus().get(systemTest).lastTriggered().get().platform()); assertNotEquals(firstTested, app.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); app.runJob(systemTest).runJob(stagingTest); + // Finish old run of the aborted production job. + app.jobAborted(productionUsEast3); + + // New upgrade is already tested for both jobs. + app.failDeployment(productionEuWest1).failDeployment(productionUsEast3) .runJob(productionEuWest1).runJob(productionUsEast3); // Both jobs fail again, and must be re-triggered -- this is ok, as they are both already triggered on their current targets. diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 7c87cbf3610..a949625baf4 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -86,6 +86,7 @@ public class MetricsReporterTest { // App spends 3 hours deploying context.submit(applicationPackage); + tester.triggerJobs(); tester.clock().advance(Duration.ofHours(1)); context.runJob(systemTest); @@ -144,8 +145,7 @@ public class MetricsReporterTest { Version version = Version.fromString("7.1"); tester.controllerTester().upgradeSystem(version); tester.upgrader().maintain(); - context.triggerJobs() - .failDeployment(systemTest) + context.failDeployment(systemTest) .failDeployment(stagingTest); reporter.maintain(); assertEquals(2, getDeploymentsFailingUpgrade(context.instanceId())); @@ -153,7 +153,6 @@ public class MetricsReporterTest { // Test and staging pass and upgrade fails in production context.runJob(systemTest) .runJob(stagingTest) - .triggerJobs() .failDeployment(productionUsWest1); reporter.maintain(); assertEquals(1, getDeploymentsFailingUpgrade(context.instanceId())); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 36039c47025..880d74b0da6 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -578,7 +578,6 @@ public class UpgraderTest { // us-central-1 succeeds upgrade to 6.3, with the revision, but us-east-3 wants to proceed with only the revision change. app.runJob(productionUsCentral1); app.runJob(systemTest).runJob(stagingTest).runJob(productionUsEast3); - app.runJob(systemTest).runJob(stagingTest); // Test last changes outside prod. assertEquals(List.of(), tester.jobs().active()); assertEquals(new Version("6.3"), app.deployment(ZoneId.from("prod", "us-central-1")).version()); @@ -936,7 +935,7 @@ public class UpgraderTest { // Application downgrades to pinned version. tester.abortAll(); - context.runJob(systemTest).runJob(stagingTest).runJob(productionUsCentral1); + context.runJob(stagingTest).runJob(productionUsCentral1); assertTrue(context.application().change().hasTargets()); context.runJob(productionUsWest1); // us-east-3 never upgraded, so no downgrade is needed. assertFalse(context.application().change().hasTargets()); @@ -1034,21 +1033,9 @@ public class UpgraderTest { assertEquals(v2, application.deployment(ZoneId.from("prod", "us-west-1")).version()); assertEquals(v1, application.deployment(ZoneId.from("prod", "us-east-3")).version()); - // Need to test v2->v3 combination again before upgrading second deployment (not really, but this is a limitation of lastSuccess) - tester.triggerJobs(); - assertEquals(v2, application.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().sourcePlatform().get()); - assertEquals(v3, application.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); - application.runJob(stagingTest); - // Second deployment upgrades application.runJob(productionUsWest1); - // ... now we have to test v1->v3 again :( - tester.triggerJobs(); - assertEquals(v1, application.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().sourcePlatform().get()); - assertEquals(v3, application.instance().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); - application.runJob(stagingTest); - // Upgrade completes application.runJob(productionUsEast3); assertTrue("Upgrade complete", application.application().change().isEmpty()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java index b5b7c1d0ad0..27305f8956f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiTest.java @@ -305,7 +305,6 @@ public class ApplicationApiTest extends ControllerContainerTest { .userIdentity(HOSTED_VESPA_OPERATOR), new File("deploy-result.json")); - // POST (create) another application ApplicationPackage applicationPackage = new ApplicationPackageBuilder() .instances("instance1") .globalServiceId("foo") @@ -315,6 +314,7 @@ public class ApplicationApiTest extends ControllerContainerTest { .allow(ValidationId.globalEndpointChange) .build(); + // POST (create) another application tester.assertResponse(request("/application/v4/tenant/tenant2/application/application2/instance/default", POST) .userIdentity(USER_ID) .oktaAccessToken(OKTA_AT).oktaIdentityToken(OKTA_IT), |