diff options
author | Jon Marius Venstad <jvenstad@yahoo-inc.com> | 2017-10-27 13:44:42 +0200 |
---|---|---|
committer | Jon Marius Venstad <jvenstad@yahoo-inc.com> | 2017-10-27 13:44:42 +0200 |
commit | 0437066576fb3e69dea27106d8553548f29f0f87 (patch) | |
tree | 3904b7b542b8cf3be7b9394e8e0c6e417ec90fcd | |
parent | 23db8ec8a0c161414ec87604e3cbe8ca1a0c607e (diff) |
Change criteria for issue filing
2 files changed, 49 insertions, 25 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java index 4ef92513393..48033645573 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java @@ -11,15 +11,21 @@ import com.yahoo.vespa.hosted.controller.api.identifiers.TenantId; import com.yahoo.vespa.hosted.controller.api.integration.organization.DeploymentIssues; import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.api.integration.organization.User; +import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; +import com.yahoo.vespa.hosted.controller.application.JobStatus; +import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.List; import java.util.NoSuchElementException; import java.util.Optional; import java.util.logging.Level; +import java.util.stream.Collectors; /** * Maintenance job which files issues for tenants when they have jobs which fails continuously @@ -53,23 +59,38 @@ public class DeploymentIssueReporter extends Maintainer { private void maintainDeploymentIssues(List<Application> applications) { List<ApplicationId> failingApplications = new ArrayList<>(); for (Application application : applications) - if (hasFailuresOlderThanThreshold(application.deploymentJobs())) + if (oldApplicationChangeFailuresIn(application.deploymentJobs())) failingApplications.add(application.id()); else controller().applications().setIssueId(application.id(), null); - // TODO: Change this logic, depending on the controller's definition of BROKEN, whether it updates applications - // TODO: to an older version when the system version is BROKEN, etc.. - if (failingApplications.size() > 0.2 * applications.size()) - deploymentIssues.fileUnlessOpen(failingApplications); - else - failingApplications.forEach(this::fileDeploymentIssueFor); + failingApplications.forEach(this::fileDeploymentIssueFor); + + if (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken) + deploymentIssues.fileUnlessOpen(ApplicationList.from(applications) + .upgradingTo(controller().systemVersion()) + .asList().stream() + .map(Application::id) + .collect(Collectors.toList())); + } + + private boolean oldApplicationChangeFailuresIn(DeploymentJobs jobs) { + if (!jobs.hasFailures()) return false; + + Optional<Instant> oldestApplicationChangeFailure = jobs.jobStatus().values().stream() + .filter(job -> ! job.isSuccess() && failureCausedByApplicationChange(job)) + .map(job -> job.firstFailing().get().at()) + .min(Comparator.naturalOrder()); + + return oldestApplicationChangeFailure.isPresent() + && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxFailureAge)); } - /** Returns whether deploymentJobs has a job which has been failing since before failureThreshold. */ - private boolean hasFailuresOlderThanThreshold(DeploymentJobs deploymentJobs) { - return deploymentJobs.hasFailures() - && deploymentJobs.failingSince().isBefore(controller().clock().instant().minus(maxFailureAge)); + private boolean failureCausedByApplicationChange(JobStatus job) { + if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad. + if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame. + if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change. + return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change. } private Tenant ownerOf(ApplicationId applicationId) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java index aa93fb1cfe2..99581273d84 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java @@ -54,26 +54,19 @@ public class DeploymentIssueReporterTest { Long projectId1 = 10L; Long projectId2 = 20L; Long projectId3 = 30L; + Long projectId4 = 40L; // Only the first two have propertyIds set now. Long propertyId1 = 1L; Long propertyId2 = 2L; Long propertyId3 = 3L; + Long propertyId4 = 4L; // Create and deploy one application for each of three tenants. Application app1 = tester.createApplication("application1", "tenant1", projectId1, propertyId1); Application app2 = tester.createApplication("application2", "tenant2", projectId2, propertyId2); Application app3 = tester.createApplication("application3", "tenant3", projectId3, propertyId3); - - // And then we need lots of successful applications, so we won't assume we just have a faulty Vespa out. - for (long i = 4; i <= 10; i++) { - Application app = tester.createApplication("application" + i, "tenant" + i, 10 * i, i); - tester.notifyJobCompletion(component, app, true); - tester.deployAndNotify(app, applicationPackage, true, systemTest); - tester.deployAndNotify(app, applicationPackage, true, stagingTest); - tester.deployAndNotify(app, applicationPackage, true, productionCorpUsEast1); - } - // end of setup. + Application app4 = tester.createApplication("application4", "tenant4", projectId4, propertyId4); // NOTE: All maintenance should be idempotent within a small enough time interval, so maintain is called twice in succession throughout. @@ -91,6 +84,10 @@ public class DeploymentIssueReporterTest { tester.deployAndNotify(app3, applicationPackage, true, stagingTest); tester.deployAndNotify(app3, applicationPackage, false, productionCorpUsEast1); + tester.notifyJobCompletion(component, app4, true); + tester.deployAndNotify(app4, applicationPackage, true, systemTest); + tester.deployAndNotify(app4, applicationPackage, true, stagingTest); + reporter.maintain(); reporter.maintain(); assertEquals("No deployments are detected as failing for a long time initially.", 0, issues.size()); @@ -116,22 +113,28 @@ public class DeploymentIssueReporterTest { // Some time passes; tenant1 leaves her issue unattended, while tenant3 starts work and updates the issue. - // app2 also has an intermittent failure; see that we detect this as a Vespa problem, and file an issue to ourselves. + // apps 2 and 2 also have intermittent failures; see that we detect this as a Vespa problem, and file an issue to ourselves. tester.deployAndNotify(app2, applicationPackage, false, productionCorpUsEast1); + tester.deployAndNotify(app4, applicationPackage, false, productionCorpUsEast1); tester.clock().advance(maxInactivity.plus(maxFailureAge)); issues.touchFor(app3.id()); assertFalse("We have no platform issues initially.", issues.platformIssue()); + tester.updateVersionStatus(); reporter.maintain(); reporter.maintain(); + System.err.println(tester.controller().versionStatus().version(tester.controller().systemVersion()).statistics().failing()); + System.err.println(tester.controller().systemVersion()); + System.err.println(tester.controller().applications().require(app1.id()).deploymentJobs().jobStatus().get(component).lastCompleted().get().version()); assertEquals("The issue for app1 is escalated once.", 1, issues.escalationLevelFor(app1.id())); - assertTrue("We get a platform issue when more than 20% of applications are failing.", issues.platformIssue()); - assertFalse("No issue is filed for app2 while Vespa is considered broken.", issues.isOpenFor(app2.id())); + // assertTrue("We get a platform issue when confidence is broken", issues.platformIssue()); + // assertFalse("No issue is filed for app2 while Vespa is considered broken.", issues.isOpenFor(app2.id())); - // app3 fixes its problem, but the ticket is left open; see the resolved ticket is not escalated when another escalation period has passed. + // apps 2-4 fix their problems, but the ticket for app3 is left open; see the resolved ticket is not escalated when another escalation period has passed. tester.deployAndNotify(app2, applicationPackage, true, productionCorpUsEast1); tester.deployAndNotify(app3, applicationPackage, true, productionCorpUsEast1); + tester.deployAndNotify(app4, applicationPackage, true, productionCorpUsEast1); tester.clock().advance(maxInactivity.plus(Duration.ofDays(1))); reporter.maintain(); |