diff options
author | Jon Bratseth <jonbratseth@yahoo.com> | 2017-10-27 15:58:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-27 15:58:12 +0200 |
commit | 2a8303d4a44321d54f86e03ce22264eeac6b644b (patch) | |
tree | e0bd1b4f814c241d53fae8c694026f1ffc620cbe | |
parent | e4b69755c3a31a0d4d6539a65a8ac8c828fab42c (diff) | |
parent | ba63ef828464e33ff4fc8cd75d608bc38e003f3b (diff) |
Merge pull request #3928 from vespa-engine/jvenstad/more-precise-deployment-issues
Jvenstad/more precise deployment issues
2 files changed, 65 insertions, 34 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java index 4ef92513393..48033645573 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java @@ -11,15 +11,21 @@ import com.yahoo.vespa.hosted.controller.api.identifiers.TenantId; import com.yahoo.vespa.hosted.controller.api.integration.organization.DeploymentIssues; import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.api.integration.organization.User; +import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; +import com.yahoo.vespa.hosted.controller.application.JobStatus; +import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.List; import java.util.NoSuchElementException; import java.util.Optional; import java.util.logging.Level; +import java.util.stream.Collectors; /** * Maintenance job which files issues for tenants when they have jobs which fails continuously @@ -53,23 +59,38 @@ public class DeploymentIssueReporter extends Maintainer { private void maintainDeploymentIssues(List<Application> applications) { List<ApplicationId> failingApplications = new ArrayList<>(); for (Application application : applications) - if (hasFailuresOlderThanThreshold(application.deploymentJobs())) + if (oldApplicationChangeFailuresIn(application.deploymentJobs())) failingApplications.add(application.id()); else controller().applications().setIssueId(application.id(), null); - // TODO: Change this logic, depending on the controller's definition of BROKEN, whether it updates applications - // TODO: to an older version when the system version is BROKEN, etc.. - if (failingApplications.size() > 0.2 * applications.size()) - deploymentIssues.fileUnlessOpen(failingApplications); - else - failingApplications.forEach(this::fileDeploymentIssueFor); + failingApplications.forEach(this::fileDeploymentIssueFor); + + if (controller().versionStatus().version(controller().systemVersion()).confidence() == VespaVersion.Confidence.broken) + deploymentIssues.fileUnlessOpen(ApplicationList.from(applications) + .upgradingTo(controller().systemVersion()) + .asList().stream() + .map(Application::id) + .collect(Collectors.toList())); + } + + private boolean oldApplicationChangeFailuresIn(DeploymentJobs jobs) { + if (!jobs.hasFailures()) return false; + + Optional<Instant> oldestApplicationChangeFailure = jobs.jobStatus().values().stream() + .filter(job -> ! job.isSuccess() && failureCausedByApplicationChange(job)) + .map(job -> job.firstFailing().get().at()) + .min(Comparator.naturalOrder()); + + return oldestApplicationChangeFailure.isPresent() + && oldestApplicationChangeFailure.get().isBefore(controller().clock().instant().minus(maxFailureAge)); } - /** Returns whether deploymentJobs has a job which has been failing since before failureThreshold. */ - private boolean hasFailuresOlderThanThreshold(DeploymentJobs deploymentJobs) { - return deploymentJobs.hasFailures() - && deploymentJobs.failingSince().isBefore(controller().clock().instant().minus(maxFailureAge)); + private boolean failureCausedByApplicationChange(JobStatus job) { + if ( ! job.lastSuccess().isPresent()) return true; // An application which never succeeded is surely bad. + if ( ! job.firstFailing().get().version().equals(job.lastSuccess().get().version())) return false; // Version change may be to blame. + if ( ! job.lastSuccess().get().revision().isPresent()) return true; // Indicates the component job, which is always an application change. + return ! job.firstFailing().get().revision().equals(job.lastSuccess().get().revision()); // Return whether there is an application change. } private Tenant ownerOf(ApplicationId applicationId) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java index aa93fb1cfe2..b4a2aa3fc19 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java @@ -1,15 +1,18 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Environment; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.api.integration.stubs.LoggingDeploymentIssues; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; +import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.persistence.MockCuratorDb; +import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Before; import org.junit.Test; @@ -36,6 +39,12 @@ public class DeploymentIssueReporterTest { .environment(Environment.prod) .region("corp-us-east-1") .build(); + private final static ApplicationPackage canaryPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("corp-us-east-1") + .upgradePolicy("canary") + .build(); + private DeploymentTester tester; private DeploymentIssueReporter reporter; @@ -55,36 +64,26 @@ public class DeploymentIssueReporterTest { Long projectId2 = 20L; Long projectId3 = 30L; - // Only the first two have propertyIds set now. Long propertyId1 = 1L; Long propertyId2 = 2L; Long propertyId3 = 3L; + tester.updateVersionStatus(Version.fromString("5.1")); + // Create and deploy one application for each of three tenants. Application app1 = tester.createApplication("application1", "tenant1", projectId1, propertyId1); Application app2 = tester.createApplication("application2", "tenant2", projectId2, propertyId2); Application app3 = tester.createApplication("application3", "tenant3", projectId3, propertyId3); - // And then we need lots of successful applications, so we won't assume we just have a faulty Vespa out. - for (long i = 4; i <= 10; i++) { - Application app = tester.createApplication("application" + i, "tenant" + i, 10 * i, i); - tester.notifyJobCompletion(component, app, true); - tester.deployAndNotify(app, applicationPackage, true, systemTest); - tester.deployAndNotify(app, applicationPackage, true, stagingTest); - tester.deployAndNotify(app, applicationPackage, true, productionCorpUsEast1); - } - // end of setup. - // NOTE: All maintenance should be idempotent within a small enough time interval, so maintain is called twice in succession throughout. - // app1 and app3 has one failure each. + // apps 1 and 3 have one failure each. tester.notifyJobCompletion(component, app1, true); tester.deployAndNotify(app1, applicationPackage, true, systemTest); tester.deployAndNotify(app1, applicationPackage, false, stagingTest); - tester.notifyJobCompletion(component, app2, true); - tester.deployAndNotify(app2, applicationPackage, true, systemTest); - tester.deployAndNotify(app2, applicationPackage, true, stagingTest); + // app2 is successful, but will fail later. + tester.deployCompletely(app2, canaryPackage); tester.notifyJobCompletion(component, app3, true); tester.deployAndNotify(app3, applicationPackage, true, systemTest); @@ -114,23 +113,16 @@ public class DeploymentIssueReporterTest { reporter.maintain(); assertTrue("Issue is re-filed for app3.", issues.isOpenFor(app3.id())); - // Some time passes; tenant1 leaves her issue unattended, while tenant3 starts work and updates the issue. - // app2 also has an intermittent failure; see that we detect this as a Vespa problem, and file an issue to ourselves. - tester.deployAndNotify(app2, applicationPackage, false, productionCorpUsEast1); tester.clock().advance(maxInactivity.plus(maxFailureAge)); issues.touchFor(app3.id()); - assertFalse("We have no platform issues initially.", issues.platformIssue()); reporter.maintain(); reporter.maintain(); assertEquals("The issue for app1 is escalated once.", 1, issues.escalationLevelFor(app1.id())); - assertTrue("We get a platform issue when more than 20% of applications are failing.", issues.platformIssue()); - assertFalse("No issue is filed for app2 while Vespa is considered broken.", issues.isOpenFor(app2.id())); - // app3 fixes its problem, but the ticket is left open; see the resolved ticket is not escalated when another escalation period has passed. - tester.deployAndNotify(app2, applicationPackage, true, productionCorpUsEast1); + // app3 fixes their problems, but the ticket for app3 is left open; see the resolved ticket is not escalated when another escalation period has passed. tester.deployAndNotify(app3, applicationPackage, true, productionCorpUsEast1); tester.clock().advance(maxInactivity.plus(Duration.ofDays(1))); @@ -151,6 +143,24 @@ public class DeploymentIssueReporterTest { reporter.maintain(); reporter.maintain(); assertTrue("A new issue is filed for app3.", issues.isOpenFor(app3.id())); + + + // Bump system version to 5.2 to upgrade canary app2. + Version version = Version.fromString("5.2"); + tester.updateVersionStatus(version); + assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + tester.completeUpgradeWithError(app2, version, canaryPackage, systemTest); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); + + assertFalse("We have no platform issues initially.", issues.platformIssue()); + reporter.maintain(); + reporter.maintain(); + assertTrue("We get a platform issue when confidence is broken", issues.platformIssue()); + assertFalse("No deployment issue is filed for app2, which has a version upgrade failure.", issues.isOpenFor(app2.id())); + } |