diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2017-11-02 09:05:24 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-11-02 09:05:24 +0100 |
commit | 68396f1ed32d73a145d1cb1413a92f1b53adf937 (patch) | |
tree | 4d67895eaaa01067bfbe6327a8b8f10e668983a0 | |
parent | f8129d907a822ebf5e30978f63db261a9fdb2af8 (diff) | |
parent | 4df8bd86a3d0d5e93c81c89d25bd4ef1ec837940 (diff) |
Merge pull request #3963 from vespa-engine/jvenstad/correct-platform-issue-filing
List only failing applicatoin, with failures older than four hours
3 files changed, 76 insertions, 38 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index 785aeefc57f..a0d038c0e8d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -5,7 +5,6 @@ import com.google.common.collect.ImmutableList; import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentSpec.UpgradePolicy; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.Environment; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ApplicationController; @@ -73,6 +72,16 @@ public class ApplicationList { return listOf(list.stream().filter(application -> ! application.deploymentJobs().hasFailures())); } + /** Returns the subset of applications which have been failing an upgrade to the given version since the given instant */ + public ApplicationList failingUpgradeToVersionSince(Version version, Instant threshold) { + return listOf(list.stream().filter(application -> failingUpgradeToVersionSince(application, version, threshold))); + } + + /** Returns the subset of applications which have been failing an application change since the given instant */ + public ApplicationList failingApplicationChangeSince(Instant threshold) { + return listOf(list.stream().filter(application -> failingApplicationChangeSince(application, threshold))); + } + /** Returns the subset of applications which currently does not have any failing jobs on the given version */ public ApplicationList notFailingOn(Version version) { return listOf(list.stream().filter(application -> ! failingOn(version, application))); @@ -177,7 +186,37 @@ public class ApplicationList { .map(status -> status.lastTriggered().get()) .anyMatch(jobRun -> jobRun.version().equals(change.version())); } - + + private static boolean failingUpgradeToVersionSince(Application application, Version version, Instant threshold) { + return application.deploymentJobs().jobStatus().values().stream() + .filter(job -> isUpgradeFailure(job)) + .filter(job -> job.firstFailing().get().at().isBefore(threshold)) + .anyMatch(job -> job.lastCompleted().get().version().equals(version)); + } + + private static boolean failingApplicationChangeSince(Application application, Instant threshold) { + return application.deploymentJobs().jobStatus().values().stream() + .filter(job -> isApplicationChangeFailure(job)) + .anyMatch(job -> job.firstFailing().get().at().isBefore(threshold)); + } + + private static boolean isUpgradeFailure(JobStatus job) { + if ( job.isSuccess()) return false; + if ( ! job.lastSuccess().isPresent()) return false; // An application which never succeeded is surely bad. + if ( ! job.lastSuccess().get().revision().isPresent()) return false; // Indicates the component job, which is not an upgrade. + if ( ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision())) return false; // Application change may be to blame. + return ! job.firstFailing().get().version().equals(job.lastSuccess().get().version()); // Return whether there is a version change. + } + + private static boolean isApplicationChangeFailure(JobStatus job) { + if ( job.isSuccess()) return false; + if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad. + if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change. + if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame. + return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change. + } + + /** Convenience converter from a stream to an ApplicationList */ private static ApplicationList listOf(Stream<Application> applications) { ImmutableList.Builder<Application> b = new ImmutableList.Builder<>(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java index 3e42fda73b3..b4708dccb6b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java @@ -13,21 +13,18 @@ import com.yahoo.vespa.hosted.controller.api.integration.organization.Deployment import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.api.integration.organization.User; import com.yahoo.vespa.hosted.controller.application.ApplicationList; -import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; -import com.yahoo.vespa.hosted.controller.application.JobStatus; -import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Duration; -import java.time.Instant; -import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; import java.util.List; import java.util.NoSuchElementException; import java.util.Optional; +import java.util.Set; import java.util.logging.Level; import java.util.stream.Collectors; +import static com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence.broken; + /** * Maintenance job which files issues for tenants when they have jobs which fails continuously * and escalates issues which are not handled in a timely manner. @@ -38,6 +35,7 @@ public class DeploymentIssueReporter extends Maintainer { static final Duration maxFailureAge = Duration.ofDays(2); static final Duration maxInactivity = Duration.ofDays(4); + static final Duration upgradeGracePeriod = Duration.ofHours(4); private final DeploymentIssues deploymentIssues; @@ -49,6 +47,7 @@ public class DeploymentIssueReporter extends Maintainer { @Override protected void maintain() { maintainDeploymentIssues(controller().applications().asList()); + maintainPlatformIssue(controller().applications().asList()); escalateInactiveDeploymentIssues(controller().applications().asList()); } @@ -58,41 +57,36 @@ public class DeploymentIssueReporter extends Maintainer { * where deployment has not failed for this amount of time. */ private void maintainDeploymentIssues(List<Application> applications) { - List<ApplicationId> failingApplications = new ArrayList<>(); + Set<ApplicationId> failingApplications = ApplicationList.from(applications) + .failingApplicationChangeSince(controller().clock().instant().minus(maxFailureAge)) + .asList().stream() + .map(Application::id) + .collect(Collectors.toSet()); + for (Application application : applications) - if (oldApplicationChangeFailuresIn(application.deploymentJobs())) - failingApplications.add(application.id()); + if (failingApplications.contains(application.id())) + fileDeploymentIssueFor(application.id()); else storeIssueId(application.id(), null); - - failingApplications.forEach(this::fileDeploymentIssueFor); - - if (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken) - deploymentIssues.fileUnlessOpen(ApplicationList.from(applications) - .upgradingTo(controller().systemVersion()) - .asList().stream() - .map(Application::id) - .collect(Collectors.toList()), - controller().systemVersion()); - } - - private boolean oldApplicationChangeFailuresIn(DeploymentJobs jobs) { - if (!jobs.hasFailures()) return false; - - Optional<Instant> oldestApplicationChangeFailure = jobs.jobStatus().values().stream() - .filter(job -> ! job.isSuccess() && failureCausedByApplicationChange(job)) - .map(job -> job.firstFailing().get().at()) - .min(Comparator.naturalOrder()); - - return oldestApplicationChangeFailure.isPresent() - && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxFailureAge)); } - private boolean failureCausedByApplicationChange(JobStatus job) { - if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad. - if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame. - if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change. - return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change. + /** + * When the confidence for the system version is BROKEN, file an issue listing the + * applications that have been failing the upgrade to the system version for + * longer than the set grace period, or update this list if the issue already exists. + */ + private void maintainPlatformIssue(List<Application> applications) { + if ( ! (controller().versionStatus().version(controller().systemVersion()).confidence() == broken)) + return; + + List<ApplicationId> failingApplications = ApplicationList.from(applications) + .failingUpgradeToVersionSince(controller().systemVersion(), controller().clock().instant().minus(upgradeGracePeriod)) + .asList().stream() + .map(Application::id) + .collect(Collectors.toList()); + + if ( ! failingApplications.isEmpty()) + deploymentIssues.fileUnlessOpen(failingApplications, controller().systemVersion()); } private Tenant ownerOf(ApplicationId applicationId) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java index afafe4be82b..b5ea8e0a36f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java @@ -25,6 +25,7 @@ import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobTy import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest; import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxFailureAge; import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxInactivity; +import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.upgradeGracePeriod; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -157,6 +158,10 @@ public class DeploymentIssueReporterTest { assertFalse("We have no platform issues initially.", issues.platformIssue()); reporter.maintain(); reporter.maintain(); + assertFalse("We have no platform issue before the grace period is out for the failing canary.", issues.platformIssue()); + tester.clock().advance(upgradeGracePeriod.plus(upgradeGracePeriod)); + reporter.maintain(); + reporter.maintain(); assertTrue("We get a platform issue when confidence is broken", issues.platformIssue()); assertFalse("No deployment issue is filed for app2, which has a version upgrade failure.", issues.isOpenFor(app2.id())); |