summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Marius Venstad <jvenstad@yahoo-inc.com>2017-10-27 13:44:42 +0200
committerJon Marius Venstad <jvenstad@yahoo-inc.com>2017-10-27 13:44:42 +0200
commit0437066576fb3e69dea27106d8553548f29f0f87 (patch)
tree3904b7b542b8cf3be7b9394e8e0c6e417ec90fcd
parent23db8ec8a0c161414ec87604e3cbe8ca1a0c607e (diff)
Change criteria for issue filing
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java43
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java31
2 files changed, 49 insertions, 25 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
index 4ef92513393..48033645573 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
@@ -11,15 +11,21 @@ import com.yahoo.vespa.hosted.controller.api.identifiers.TenantId;
import com.yahoo.vespa.hosted.controller.api.integration.organization.DeploymentIssues;
import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId;
import com.yahoo.vespa.hosted.controller.api.integration.organization.User;
+import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.DeploymentJobs;
+import com.yahoo.vespa.hosted.controller.application.JobStatus;
+import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Comparator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.logging.Level;
+import java.util.stream.Collectors;
/**
* Maintenance job which files issues for tenants when they have jobs which fails continuously
@@ -53,23 +59,38 @@ public class DeploymentIssueReporter extends Maintainer {
private void maintainDeploymentIssues(List<Application> applications) {
List<ApplicationId> failingApplications = new ArrayList<>();
for (Application application : applications)
- if (hasFailuresOlderThanThreshold(application.deploymentJobs()))
+ if (oldApplicationChangeFailuresIn(application.deploymentJobs()))
failingApplications.add(application.id());
else
controller().applications().setIssueId(application.id(), null);
- // TODO: Change this logic, depending on the controller's definition of BROKEN, whether it updates applications
- // TODO: to an older version when the system version is BROKEN, etc..
- if (failingApplications.size() > 0.2 * applications.size())
- deploymentIssues.fileUnlessOpen(failingApplications);
- else
- failingApplications.forEach(this::fileDeploymentIssueFor);
+ failingApplications.forEach(this::fileDeploymentIssueFor);
+
+ if (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken)
+ deploymentIssues.fileUnlessOpen(ApplicationList.from(applications)
+ .upgradingTo(controller().systemVersion())
+ .asList().stream()
+ .map(Application::id)
+ .collect(Collectors.toList()));
+ }
+
+ private boolean oldApplicationChangeFailuresIn(DeploymentJobs jobs) {
+ if (!jobs.hasFailures()) return false;
+
+ Optional<Instant> oldestApplicationChangeFailure = jobs.jobStatus().values().stream()
+ .filter(job -> ! job.isSuccess() && failureCausedByApplicationChange(job))
+ .map(job -> job.firstFailing().get().at())
+ .min(Comparator.naturalOrder());
+
+ return oldestApplicationChangeFailure.isPresent()
+ && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxFailureAge));
}
- /** Returns whether deploymentJobs has a job which has been failing since before failureThreshold. */
- private boolean hasFailuresOlderThanThreshold(DeploymentJobs deploymentJobs) {
- return deploymentJobs.hasFailures()
- && deploymentJobs.failingSince().isBefore(controller().clock().instant().minus(maxFailureAge));
+ private boolean failureCausedByApplicationChange(JobStatus job) {
+ if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad.
+ if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame.
+ if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change.
+ return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change.
}
private Tenant ownerOf(ApplicationId applicationId) {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
index aa93fb1cfe2..99581273d84 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java
@@ -54,26 +54,19 @@ public class DeploymentIssueReporterTest {
Long projectId1 = 10L;
Long projectId2 = 20L;
Long projectId3 = 30L;
+ Long projectId4 = 40L;
// Only the first two have propertyIds set now.
Long propertyId1 = 1L;
Long propertyId2 = 2L;
Long propertyId3 = 3L;
+ Long propertyId4 = 4L;
// Create and deploy one application for each of three tenants.
Application app1 = tester.createApplication("application1", "tenant1", projectId1, propertyId1);
Application app2 = tester.createApplication("application2", "tenant2", projectId2, propertyId2);
Application app3 = tester.createApplication("application3", "tenant3", projectId3, propertyId3);
-
- // And then we need lots of successful applications, so we won't assume we just have a faulty Vespa out.
- for (long i = 4; i <= 10; i++) {
- Application app = tester.createApplication("application" + i, "tenant" + i, 10 * i, i);
- tester.notifyJobCompletion(component, app, true);
- tester.deployAndNotify(app, applicationPackage, true, systemTest);
- tester.deployAndNotify(app, applicationPackage, true, stagingTest);
- tester.deployAndNotify(app, applicationPackage, true, productionCorpUsEast1);
- }
- // end of setup.
+ Application app4 = tester.createApplication("application4", "tenant4", projectId4, propertyId4);
// NOTE: All maintenance should be idempotent within a small enough time interval, so maintain is called twice in succession throughout.
@@ -91,6 +84,10 @@ public class DeploymentIssueReporterTest {
tester.deployAndNotify(app3, applicationPackage, true, stagingTest);
tester.deployAndNotify(app3, applicationPackage, false, productionCorpUsEast1);
+ tester.notifyJobCompletion(component, app4, true);
+ tester.deployAndNotify(app4, applicationPackage, true, systemTest);
+ tester.deployAndNotify(app4, applicationPackage, true, stagingTest);
+
reporter.maintain();
reporter.maintain();
assertEquals("No deployments are detected as failing for a long time initially.", 0, issues.size());
@@ -116,22 +113,28 @@ public class DeploymentIssueReporterTest {
// Some time passes; tenant1 leaves her issue unattended, while tenant3 starts work and updates the issue.
- // app2 also has an intermittent failure; see that we detect this as a Vespa problem, and file an issue to ourselves.
+ // apps 2 and 2 also have intermittent failures; see that we detect this as a Vespa problem, and file an issue to ourselves.
tester.deployAndNotify(app2, applicationPackage, false, productionCorpUsEast1);
+ tester.deployAndNotify(app4, applicationPackage, false, productionCorpUsEast1);
tester.clock().advance(maxInactivity.plus(maxFailureAge));
issues.touchFor(app3.id());
assertFalse("We have no platform issues initially.", issues.platformIssue());
+ tester.updateVersionStatus();
reporter.maintain();
reporter.maintain();
+ System.err.println(tester.controller().versionStatus().version(tester.controller().systemVersion()).statistics().failing());
+ System.err.println(tester.controller().systemVersion());
+ System.err.println(tester.controller().applications().require(app1.id()).deploymentJobs().jobStatus().get(component).lastCompleted().get().version());
assertEquals("The issue for app1 is escalated once.", 1, issues.escalationLevelFor(app1.id()));
- assertTrue("We get a platform issue when more than 20% of applications are failing.", issues.platformIssue());
- assertFalse("No issue is filed for app2 while Vespa is considered broken.", issues.isOpenFor(app2.id()));
+ // assertTrue("We get a platform issue when confidence is broken", issues.platformIssue());
+ // assertFalse("No issue is filed for app2 while Vespa is considered broken.", issues.isOpenFor(app2.id()));
- // app3 fixes its problem, but the ticket is left open; see the resolved ticket is not escalated when another escalation period has passed.
+ // apps 2-4 fix their problems, but the ticket for app3 is left open; see the resolved ticket is not escalated when another escalation period has passed.
tester.deployAndNotify(app2, applicationPackage, true, productionCorpUsEast1);
tester.deployAndNotify(app3, applicationPackage, true, productionCorpUsEast1);
+ tester.deployAndNotify(app4, applicationPackage, true, productionCorpUsEast1);
tester.clock().advance(maxInactivity.plus(Duration.ofDays(1)));
reporter.maintain();