summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Marius Venstad <jonmv@users.noreply.github.com>2017-11-02 09:05:24 +0100
committerGitHub <noreply@github.com>2017-11-02 09:05:24 +0100
commit68396f1ed32d73a145d1cb1413a92f1b53adf937 (patch)
tree4d67895eaaa01067bfbe6327a8b8f10e668983a0
parentf8129d907a822ebf5e30978f63db261a9fdb2af8 (diff)
parent4df8bd86a3d0d5e93c81c89d25bd4ef1ec837940 (diff)
Merge pull request #3963 from vespa-engine/jvenstad/correct-platform-issue-filing
List only failing applicatoin, with failures older than four hours
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java43
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java66
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java5
3 files changed, 76 insertions, 38 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
index 785aeefc57f..a0d038c0e8d 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
@@ -5,7 +5,6 @@ import com.google.common.collect.ImmutableList;
import com.yahoo.component.Version;
import com.yahoo.config.application.api.DeploymentSpec.UpgradePolicy;
import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.Environment;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.ApplicationController;
@@ -73,6 +72,16 @@ public class ApplicationList {
return listOf(list.stream().filter(application -> ! application.deploymentJobs().hasFailures()));
}
+ /** Returns the subset of applications which have been failing an upgrade to the given version since the given instant */
+ public ApplicationList failingUpgradeToVersionSince(Version version, Instant threshold) {
+ return listOf(list.stream().filter(application -> failingUpgradeToVersionSince(application, version, threshold)));
+ }
+
+ /** Returns the subset of applications which have been failing an application change since the given instant */
+ public ApplicationList failingApplicationChangeSince(Instant threshold) {
+ return listOf(list.stream().filter(application -> failingApplicationChangeSince(application, threshold)));
+ }
+
/** Returns the subset of applications which currently does not have any failing jobs on the given version */
public ApplicationList notFailingOn(Version version) {
return listOf(list.stream().filter(application -> ! failingOn(version, application)));
@@ -177,7 +186,37 @@ public class ApplicationList {
.map(status -> status.lastTriggered().get())
.anyMatch(jobRun -> jobRun.version().equals(change.version()));
}
-
+
+ private static boolean failingUpgradeToVersionSince(Application application, Version version, Instant threshold) {
+ return application.deploymentJobs().jobStatus().values().stream()
+ .filter(job -> isUpgradeFailure(job))
+ .filter(job -> job.firstFailing().get().at().isBefore(threshold))
+ .anyMatch(job -> job.lastCompleted().get().version().equals(version));
+ }
+
+ private static boolean failingApplicationChangeSince(Application application, Instant threshold) {
+ return application.deploymentJobs().jobStatus().values().stream()
+ .filter(job -> isApplicationChangeFailure(job))
+ .anyMatch(job -> job.firstFailing().get().at().isBefore(threshold));
+ }
+
+ private static boolean isUpgradeFailure(JobStatus job) {
+ if ( job.isSuccess()) return false;
+ if ( ! job.lastSuccess().isPresent()) return false; // An application which never succeeded is surely bad.
+ if ( ! job.lastSuccess().get().revision().isPresent()) return false; // Indicates the component job, which is not an upgrade.
+ if ( ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision())) return false; // Application change may be to blame.
+ return ! job.firstFailing().get().version().equals(job.lastSuccess().get().version()); // Return whether there is a version change.
+ }
+
+ private static boolean isApplicationChangeFailure(JobStatus job) {
+ if ( job.isSuccess()) return false;
+ if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad.
+ if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change.
+ if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame.
+ return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change.
+ }
+
+
/** Convenience converter from a stream to an ApplicationList */
private static ApplicationList listOf(Stream<Application> applications) {
ImmutableList.Builder<Application> b = new ImmutableList.Builder<>();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
index 3e42fda73b3..b4708dccb6b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
@@ -13,21 +13,18 @@ import com.yahoo.vespa.hosted.controller.api.integration.organization.Deployment
import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId;
import com.yahoo.vespa.hosted.controller.api.integration.organization.User;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
-import com.yahoo.vespa.hosted.controller.application.DeploymentJobs;
-import com.yahoo.vespa.hosted.controller.application.JobStatus;
-import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import java.time.Duration;
-import java.time.Instant;
-import java.util.ArrayList;
import java.util.Collection;
-import java.util.Comparator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
+import java.util.Set;
import java.util.logging.Level;
import java.util.stream.Collectors;
+import static com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence.broken;
+
/**
* Maintenance job which files issues for tenants when they have jobs which fails continuously
* and escalates issues which are not handled in a timely manner.
@@ -38,6 +35,7 @@ public class DeploymentIssueReporter extends Maintainer {
static final Duration maxFailureAge = Duration.ofDays(2);
static final Duration maxInactivity = Duration.ofDays(4);
+ static final Duration upgradeGracePeriod = Duration.ofHours(4);
private final DeploymentIssues deploymentIssues;
@@ -49,6 +47,7 @@ public class DeploymentIssueReporter extends Maintainer {
@Override
protected void maintain() {
maintainDeploymentIssues(controller().applications().asList());
+ maintainPlatformIssue(controller().applications().asList());
escalateInactiveDeploymentIssues(controller().applications().asList());
}
@@ -58,41 +57,36 @@ public class DeploymentIssueReporter extends Maintainer {
* where deployment has not failed for this amount of time.
*/
private void maintainDeploymentIssues(List<Application> applications) {
- List<ApplicationId> failingApplications = new ArrayList<>();
+ Set<ApplicationId> failingApplications = ApplicationList.from(applications)
+ .failingApplicationChangeSince(controller().clock().instant().minus(maxFailureAge))
+ .asList().stream()
+ .map(Application::id)
+ .collect(Collectors.toSet());
+
for (Application application : applications)
- if (oldApplicationChangeFailuresIn(application.deploymentJobs()))
- failingApplications.add(application.id());
+ if (failingApplications.contains(application.id()))
+ fileDeploymentIssueFor(application.id());
else
storeIssueId(application.id(), null);
-
- failingApplications.forEach(this::fileDeploymentIssueFor);
-
- if (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken)
- deploymentIssues.fileUnlessOpen(ApplicationList.from(applications)
- .upgradingTo(controller().systemVersion())
- .asList().stream()
- .map(Application::id)
- .collect(Collectors.toList()),
- controller().systemVersion());
- }
-
- private boolean oldApplicationChangeFailuresIn(DeploymentJobs jobs) {
- if (!jobs.hasFailures()) return false;
-
- Optional<Instant> oldestApplicationChangeFailure = jobs.jobStatus().values().stream()
- .filter(job -> ! job.isSuccess() && failureCausedByApplicationChange(job))
- .map(job -> job.firstFailing().get().at())
- .min(Comparator.naturalOrder());
-
- return oldestApplicationChangeFailure.isPresent()
- && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxFailureAge));
}
- private boolean failureCausedByApplicationChange(JobStatus job) {
- if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad.
- if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame.
- if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change.
- return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change.
+ /**
+ * When the confidence for the system version is BROKEN, file an issue listing the
+ * applications that have been failing the upgrade to the system version for
+ * longer than the set grace period, or update this list if the issue already exists.
+ */
+ private void maintainPlatformIssue(List<Application> applications) {
+ if ( ! (controller().versionStatus().version(controller().systemVersion()).confidence() == broken))
+ return;
+
+ List<ApplicationId> failingApplications = ApplicationList.from(applications)
+ .failingUpgradeToVersionSince(controller().systemVersion(), controller().clock().instant().minus(upgradeGracePeriod))
+ .asList().stream()
+ .map(Application::id)
+ .collect(Collectors.toList());
+
+ if ( ! failingApplications.isEmpty())
+ deploymentIssues.fileUnlessOpen(failingApplications, controller().systemVersion());
}
private Tenant ownerOf(ApplicationId applicationId) {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
index afafe4be82b..b5ea8e0a36f 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
@@ -25,6 +25,7 @@ import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobTy
import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest;
import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxFailureAge;
import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxInactivity;
+import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.upgradeGracePeriod;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -157,6 +158,10 @@ public class DeploymentIssueReporterTest {
assertFalse("We have no platform issues initially.", issues.platformIssue());
reporter.maintain();
reporter.maintain();
+ assertFalse("We have no platform issue before the grace period is out for the failing canary.", issues.platformIssue());
+ tester.clock().advance(upgradeGracePeriod.plus(upgradeGracePeriod));
+ reporter.maintain();
+ reporter.maintain();
assertTrue("We get a platform issue when confidence is broken", issues.platformIssue());
assertFalse("No deployment issue is filed for app2, which has a version upgrade failure.", issues.isOpenFor(app2.id()));