summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorJon Marius Venstad <jvenstad@yahoo-inc.com>2017-11-01 12:41:40 +0100
committerJon Marius Venstad <jvenstad@yahoo-inc.com>2017-11-01 12:41:40 +0100
commit15efaea249544917b178f7d227692792e9f6ca9d (patch)
tree935eec1221de37ae189cbbe11bc9982085502490 /controller-server
parentc4e121b78594bb81f894fcfff1cb1ab4b4b8edcb (diff)
List only failing applications, with failures older than two hours
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java43
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java5
2 files changed, 34 insertions, 14 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
index 3e42fda73b3..031809ce699 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
@@ -25,8 +25,8 @@ import java.util.Comparator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
+import java.util.function.Predicate;
import java.util.logging.Level;
-import java.util.stream.Collectors;
/**
* Maintenance job which files issues for tenants when they have jobs which fail continuously
@@ -38,6 +38,7 @@ public class DeploymentIssueReporter extends Maintainer {
static final Duration maxFailureAge = Duration.ofDays(2);
static final Duration maxInactivity = Duration.ofDays(4);
+ static final Duration upgradeGracePeriod = Duration.ofHours(2);
private final DeploymentIssues deploymentIssues;
@@ -49,6 +50,7 @@ public class DeploymentIssueReporter extends Maintainer {
@Override
protected void maintain() {
maintainDeploymentIssues(controller().applications().asList());
+ maintainPlatformIssue(controller().applications().asList());
escalateInactiveDeploymentIssues(controller().applications().asList());
}
@@ -60,35 +62,48 @@ public class DeploymentIssueReporter extends Maintainer {
private void maintainDeploymentIssues(List<Application> applications) {
List<ApplicationId> failingApplications = new ArrayList<>();
for (Application application : applications)
- if (oldApplicationChangeFailuresIn(application.deploymentJobs()))
+ if (oldFailuresIn(application.deploymentJobs(), this::causedByApplicationChange, maxFailureAge))
failingApplications.add(application.id());
else
storeIssueId(application.id(), null);
failingApplications.forEach(this::fileDeploymentIssueFor);
- if (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken)
- deploymentIssues.fileUnlessOpen(ApplicationList.from(applications)
- .upgradingTo(controller().systemVersion())
- .asList().stream()
- .map(Application::id)
- .collect(Collectors.toList()),
- controller().systemVersion());
}
- private boolean oldApplicationChangeFailuresIn(DeploymentJobs jobs) {
- if (!jobs.hasFailures()) return false;
+ /**
+ * When the confidence for the system version is BROKEN, file an issue listing the
+ * applications that have been failing the upgrade to the system version for
+ * longer than the set grace period, or update this list if the issue already exists.
+ */
+ private void maintainPlatformIssue(List<Application> applications) {
+ if ( ! (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken))
+ return;
+
+ List<ApplicationId> failingApplications = new ArrayList<>();
+ for (Application application : ApplicationList.from(applications).upgradingTo(controller().systemVersion()).asList())
+ if (oldFailuresIn(application.deploymentJobs(), job -> ! causedByApplicationChange(job), upgradeGracePeriod))
+ failingApplications.add(application.id());
+
+ if ( ! failingApplications.isEmpty())
+ deploymentIssues.fileUnlessOpen(failingApplications, controller().systemVersion());
+ }
+
+ /** Return whether the given deployment jobs contain failures due to the given cause, past the given age. */
+ private boolean oldFailuresIn(DeploymentJobs jobs, Predicate<JobStatus> failureCause, Duration maxAge) {
+ if ( ! jobs.hasFailures()) return false;
Optional<Instant> oldestApplicationChangeFailure = jobs.jobStatus().values().stream()
- .filter(job -> ! job.isSuccess() && failureCausedByApplicationChange(job))
+ .filter(job -> ! job.isSuccess())
+ .filter(failureCause)
.map(job -> job.firstFailing().get().at())
.min(Comparator.naturalOrder());
return oldestApplicationChangeFailure.isPresent()
- && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxFailureAge));
+ && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxAge));
}
- private boolean failureCausedByApplicationChange(JobStatus job) {
+ private boolean causedByApplicationChange(JobStatus job) {
if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad.
if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame.
if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change.
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
index afafe4be82b..b5ea8e0a36f 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
@@ -25,6 +25,7 @@ import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobTy
import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest;
import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxFailureAge;
import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxInactivity;
+import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.upgradeGracePeriod;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -157,6 +158,10 @@ public class DeploymentIssueReporterTest {
assertFalse("We have no platform issues initially.", issues.platformIssue());
reporter.maintain();
reporter.maintain();
+ assertFalse("We have no platform issue before the grace period is out for the failing canary.", issues.platformIssue());
+ tester.clock().advance(upgradeGracePeriod.plus(upgradeGracePeriod));
+ reporter.maintain();
+ reporter.maintain();
assertTrue("We get a platform issue when confidence is broken", issues.platformIssue());
assertFalse("No deployment issue is filed for app2, which has a version upgrade failure.", issues.isOpenFor(app2.id()));