diff options
author | Jon Marius Venstad <jvenstad@yahoo-inc.com> | 2017-10-20 09:58:59 +0200 |
---|---|---|
committer | Jon Marius Venstad <jvenstad@yahoo-inc.com> | 2017-10-20 09:58:59 +0200 |
commit | e5e197ec9390033da499cebfb68ba92ac74cb17b (patch) | |
tree | bce37c26ab829ec8db428f72b5aef822cedc16b3 /controller-server/src | |
parent | b41d2e64fddaaba2763db313423652dfd4d0912c (diff) |
Refactored deployment issues >_<
Diffstat (limited to 'controller-server/src')
10 files changed, 147 insertions, 356 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java index f68ce3ebfa5..54b37c02787 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java @@ -8,6 +8,7 @@ import com.yahoo.config.application.api.ValidationOverrides; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Zone; +import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.application.ApplicationRevision; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Deployment; @@ -129,8 +130,8 @@ public class Application { return new Application(id, deploymentSpec, validationOverrides, deployments, deploymentJobs.withProjectId(projectId), deploying, outstandingChange); } - public Application withJiraIssueId(Optional<String> jiraIssueId) { - return new Application(id, deploymentSpec, validationOverrides, deployments, deploymentJobs.withJiraIssueId(jiraIssueId), deploying, outstandingChange); + public Application withIssueId(IssueId issueId) { + return new Application(id, deploymentSpec, validationOverrides, deployments, deploymentJobs.withIssueId(issueId), deploying, outstandingChange); } public Application withJobCompletion(JobReport report, Instant notificationTime, Controller controller) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java index 2ce2e480fc5..42a85033bd6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java @@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.api.identifiers.Hostname; import com.yahoo.vespa.hosted.controller.api.identifiers.RevisionId; import com.yahoo.vespa.hosted.controller.api.identifiers.TenantId; +import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.api.integration.athens.NToken; import com.yahoo.vespa.hosted.controller.api.integration.athens.ZmsClient; import com.yahoo.vespa.hosted.controller.api.integration.athens.ZmsClientFactory; @@ -495,9 +496,9 @@ public class ApplicationController { } } - public void setJiraIssueId(ApplicationId id, Optional<String> jiraIssueId) { + public void setIssueId(ApplicationId id, IssueId issueId) { try (Lock lock = lock(id)) { - get(id).ifPresent(application -> store(application.withJiraIssueId(jiraIssueId), lock)); + get(id).ifPresent(application -> store(application.withIssueId(issueId), lock)); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Controller.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Controller.java index 68912ac55ef..4128e64f53e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Controller.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Controller.java @@ -23,7 +23,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.cost.Cost; import com.yahoo.vespa.hosted.controller.api.integration.dns.NameService; import com.yahoo.vespa.hosted.controller.api.integration.entity.EntityService; import com.yahoo.vespa.hosted.controller.api.integration.github.GitHub; -import com.yahoo.vespa.hosted.controller.api.integration.jira.Jira; +import com.yahoo.vespa.hosted.controller.api.integration.organization.Organization; import com.yahoo.vespa.hosted.controller.api.integration.routing.GlobalRoutingService; import com.yahoo.vespa.hosted.controller.api.integration.routing.RotationStatus; import com.yahoo.vespa.hosted.controller.api.integration.routing.RoutingGenerator; @@ -96,19 +96,19 @@ public class Controller extends AbstractComponent { */ @Inject public Controller(ControllerDb db, CuratorDb curator, RotationRepository rotationRepository, - GitHub gitHub, Jira jiraClient, EntityService entityService, + GitHub gitHub, EntityService entityService, GlobalRoutingService globalRoutingService, ZoneRegistry zoneRegistry, Cost cost, ConfigServerClient configServerClient, MetricsService metricsService, NameService nameService, RoutingGenerator routingGenerator, Chef chefClient, Athens athens) { this(db, curator, rotationRepository, - gitHub, jiraClient, entityService, globalRoutingService, zoneRegistry, + gitHub, entityService, globalRoutingService, zoneRegistry, cost, configServerClient, metricsService, nameService, routingGenerator, chefClient, Clock.systemUTC(), athens); } public Controller(ControllerDb db, CuratorDb curator, RotationRepository rotationRepository, - GitHub gitHub, Jira jiraClient, EntityService entityService, + GitHub gitHub, EntityService entityService, GlobalRoutingService globalRoutingService, ZoneRegistry zoneRegistry, Cost cost, ConfigServerClient configServerClient, MetricsService metricsService, NameService nameService, @@ -117,7 +117,6 @@ public class Controller extends AbstractComponent { Objects.requireNonNull(curator, "Curator cannot be null"); Objects.requireNonNull(rotationRepository, "Rotation repository cannot be null"); Objects.requireNonNull(gitHub, "GitHubClient cannot be null"); - Objects.requireNonNull(jiraClient, "JiraClient cannot be null"); Objects.requireNonNull(entityService, "EntityService cannot be null"); Objects.requireNonNull(globalRoutingService, "GlobalRoutingService cannot be null"); Objects.requireNonNull(zoneRegistry, "ZoneRegistry cannot be null"); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java index 26bef06adcf..ea44b32fbda 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java @@ -9,6 +9,7 @@ import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.controller.Controller; +import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import java.time.Instant; import java.util.Collection; @@ -30,20 +31,20 @@ public class DeploymentJobs { private final Optional<Long> projectId; private final ImmutableMap<JobType, JobStatus> status; - private final Optional<String> jiraIssueId; + private final Optional<IssueId> issueId; public DeploymentJobs(Optional<Long> projectId, Collection<JobStatus> jobStatusEntries, - Optional<String> jiraIssueId) { - this(projectId, asMap(jobStatusEntries), jiraIssueId); + Optional<IssueId> issueId) { + this(projectId, asMap(jobStatusEntries), issueId); } - private DeploymentJobs(Optional<Long> projectId, Map<JobType, JobStatus> status, Optional<String> jiraIssueId) { - requireId(projectId, "projectId cannot be null or <= 0"); + private DeploymentJobs(Optional<Long> projectId, Map<JobType, JobStatus> status, Optional<IssueId> issueId) { + requireId(projectId, "projectId must be a positive integer"); Objects.requireNonNull(status, "status cannot be null"); - Objects.requireNonNull(jiraIssueId, "jiraIssueId cannot be null"); + Objects.requireNonNull(issueId, "issueId cannot be null"); this.projectId = projectId; this.status = ImmutableMap.copyOf(status); - this.jiraIssueId = jiraIssueId; + this.issueId = issueId; } private static Map<JobType, JobStatus> asMap(Collection<JobStatus> jobStatusEntries) { @@ -60,7 +61,7 @@ public class DeploymentJobs { if (job == null) job = JobStatus.initial(report.jobType()); return job.withCompletion(report.jobError(), notificationTime, controller); }); - return new DeploymentJobs(Optional.of(report.projectId()), status, jiraIssueId); + return new DeploymentJobs(Optional.of(report.projectId()), status, issueId); } public DeploymentJobs withTriggering(JobType jobType, @@ -75,21 +76,21 @@ public class DeploymentJobs { change.isPresent() && change.get() instanceof Change.VersionChange, triggerTime); }); - return new DeploymentJobs(projectId, status, jiraIssueId); + return new DeploymentJobs(projectId, status, issueId); } public DeploymentJobs withProjectId(long projectId) { - return new DeploymentJobs(Optional.of(projectId), status, jiraIssueId); + return new DeploymentJobs(Optional.of(projectId), status, issueId); } - public DeploymentJobs withJiraIssueId(Optional<String> jiraIssueId) { - return new DeploymentJobs(projectId, status, jiraIssueId); + public DeploymentJobs withIssueId(IssueId issueId) { + return new DeploymentJobs(projectId, status, Optional.ofNullable(issueId)); } public DeploymentJobs without(JobType job) { Map<JobType, JobStatus> status = new HashMap<>(this.status); status.remove(job); - return new DeploymentJobs(projectId, status, jiraIssueId); + return new DeploymentJobs(projectId, status, issueId); } /** Returns an immutable map of the status entries in this */ @@ -154,7 +155,7 @@ public class DeploymentJobs { */ public Optional<Long> projectId() { return projectId; } - public Optional<String> jiraIssueId() { return jiraIssueId; } + public Optional<IssueId> issueId() { return issueId; } /** Job types that exist in the build system */ public enum JobType { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java index 1b692ecf243..d20773d8882 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java @@ -4,9 +4,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.AbstractComponent; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.controller.Controller; -import com.yahoo.vespa.hosted.controller.api.integration.Contacts; -import com.yahoo.vespa.hosted.controller.api.integration.Issues; -import com.yahoo.vespa.hosted.controller.api.integration.Properties; +import com.yahoo.vespa.hosted.controller.api.integration.organization.DeploymentIssues; import com.yahoo.vespa.hosted.controller.api.integration.chef.Chef; import com.yahoo.vespa.hosted.controller.maintenance.config.MaintainerConfig; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; @@ -36,12 +34,11 @@ public class ControllerMaintenance extends AbstractComponent { @SuppressWarnings("unused") // instantiated by Dependency Injection public ControllerMaintenance(MaintainerConfig maintainerConfig, Controller controller, CuratorDb curator, JobControl jobControl, Metric metric, Chef chefClient, - Contacts contactsClient, Properties propertiesClient, Issues issuesClient) { + DeploymentIssues deploymentIssues) { Duration maintenanceInterval = Duration.ofMinutes(maintainerConfig.intervalMinutes()); this.jobControl = jobControl; deploymentExpirer = new DeploymentExpirer(controller, maintenanceInterval, jobControl); - deploymentIssueReporter = new DeploymentIssueReporter(controller, contactsClient, propertiesClient, - issuesClient, maintenanceInterval, jobControl); + deploymentIssueReporter = new DeploymentIssueReporter(controller, deploymentIssues, maintenanceInterval, jobControl); metricsReporter = new MetricsReporter(controller, metric, chefClient, jobControl, controller.system()); failureRedeployer = new FailureRedeployer(controller, maintenanceInterval, jobControl); outstandingChangeDeployer = new OutstandingChangeDeployer(controller, maintenanceInterval, jobControl); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java index 48e67a148b5..4ef92513393 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java @@ -8,52 +8,35 @@ import com.yahoo.vespa.hosted.controller.api.Tenant; import com.yahoo.vespa.hosted.controller.api.application.v4.model.TenantType; import com.yahoo.vespa.hosted.controller.api.identifiers.PropertyId; import com.yahoo.vespa.hosted.controller.api.identifiers.TenantId; -import com.yahoo.vespa.hosted.controller.api.integration.Contacts; -import com.yahoo.vespa.hosted.controller.api.integration.Contacts.UserContact; -import com.yahoo.vespa.hosted.controller.api.integration.Issues; -import com.yahoo.vespa.hosted.controller.api.integration.Issues.Classification; -import com.yahoo.vespa.hosted.controller.api.integration.Issues.Issue; -import com.yahoo.vespa.hosted.controller.api.integration.Issues.IssueInfo; -import com.yahoo.vespa.hosted.controller.api.integration.Properties; +import com.yahoo.vespa.hosted.controller.api.integration.organization.DeploymentIssues; +import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; +import com.yahoo.vespa.hosted.controller.api.integration.organization.User; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import java.time.Duration; -import java.time.Instant; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.NoSuchElementException; import java.util.Optional; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static com.yahoo.vespa.hosted.controller.api.integration.Contacts.Category.admin; -import static com.yahoo.vespa.hosted.controller.api.integration.Issues.IssueInfo.Status.done; +import java.util.logging.Level; /** - * Maintenance job which creates Jira issues for tenants when they have jobs which fails continuously - * and escalates issues which are not handled. + * Maintenance job which files issues for tenants when they have jobs which fails continuously + * and escalates issues which are not handled in a timely manner. * * @author jvenstad */ public class DeploymentIssueReporter extends Maintainer { static final Duration maxFailureAge = Duration.ofDays(2); - static final Duration maxInactivityAge = Duration.ofDays(4); - static final String deploymentFailureLabel = "vespaDeploymentFailure"; - static final Classification vespaOps = new Classification("VESPA", "Services", deploymentFailureLabel, null); - static final UserContact terminalUser = new UserContact("frodelu", "Frode Lundgren", admin); + static final Duration maxInactivity = Duration.ofDays(4); - private final Contacts contacts; - private final Properties properties; - private final Issues issues; + private final DeploymentIssues deploymentIssues; - DeploymentIssueReporter(Controller controller, Contacts contacts, Properties properties, Issues issues, - Duration maintenanceInterval, JobControl jobControl) { + DeploymentIssueReporter(Controller controller, DeploymentIssues deploymentIssues, Duration maintenanceInterval, JobControl jobControl) { super(controller, maintenanceInterval, jobControl); - this.contacts = contacts; - this.properties = properties; - this.issues = issues; + this.deploymentIssues = deploymentIssues; } @Override @@ -63,183 +46,71 @@ public class DeploymentIssueReporter extends Maintainer { } /** - * File issues for applications which have failed deployment for longer than @maxFailureAge - * and store the issue id for the filed issues. Also, clear the @issueIds of applications + * File issues for applications which have failed deployment for longer than maxFailureAge + * and store the issue id for the filed issues. Also, clear the issueIds of applications * where deployment has not failed for this amount of time. */ private void maintainDeploymentIssues(List<Application> applications) { - Collection<Application> failingApplications = new ArrayList<>(); + List<ApplicationId> failingApplications = new ArrayList<>(); for (Application application : applications) - if (failingSinceBefore(application.deploymentJobs(), controller().clock().instant().minus(maxFailureAge))) - failingApplications.add(application); + if (hasFailuresOlderThanThreshold(application.deploymentJobs())) + failingApplications.add(application.id()); else - controller().applications().setJiraIssueId(application.id(), Optional.empty()); - - // TODO: Do this when version.confidence is BROKEN instead? Or, exclude above those upgrading to BROKEN version? - if (failingApplications.size() > 0.2 * applications.size()) { - fileOrUpdate(manyFailingDeploymentsIssueFrom(failingApplications)); // Problems with Vespa is the most likely cause when so many deployments fail. - } - else { - for (Application application : failingApplications) { - Issue deploymentIssue = deploymentIssueFrom(application); - Tenant applicationTenant = null; - Classification applicationOwner = null; - try { - applicationTenant= ownerOf(application); - applicationOwner = applicationTenant.tenantType() == TenantType.USER - ? vespaOps.withAssignee(applicationTenant.getId().id().replaceFirst("by-", "")) - : jiraClassificationOf(applicationTenant); - fileFor(application, deploymentIssue.with(applicationOwner)); - } - catch (RuntimeException e) { // Catch errors due to inconsistent or missing data in Sherpa, OpsDB, JIRA, and send to ourselves. - Pattern componentError = Pattern.compile(".*Component name '.*' is not valid.*", Pattern.DOTALL); - if (componentError.matcher(e.getMessage()).matches()) // Several properties seem to list invalid components, in which case we simply ignore this. - fileFor(application, - deploymentIssue - .with(applicationOwner.withComponent(null)) - .append("\n\nNote: The 'Queue Component' field in [opsdb|https://opsdb.ops.yahoo.com/properties.php?id=" + - applicationTenant.getPropertyId().get() + - "&action=view] for your property was rejected by JIRA. Please check your spelling.")); - else - fileFor(application, deploymentIssue.with(vespaOps).append(e.getMessage() + "\n\nAddressee:\n" + applicationOwner)); - } - } - } - } - - /** Returns whether @deploymentJobs has a job which has been failing since before @failureThreshold or not. */ - private boolean failingSinceBefore(DeploymentJobs deploymentJobs, Instant failureThreshold) { - return deploymentJobs.hasFailures() && deploymentJobs.failingSince().isBefore(failureThreshold); - } - - private Tenant ownerOf(Application application) { - return controller().tenants().tenant(new TenantId(application.id().tenant().value())).get(); - } - - /** Use the @propertyId of @tenant, if present, to look up JIRA information in OpsDB. */ - private Classification jiraClassificationOf(Tenant tenant) { - Long propertyId = tenant.getPropertyId().map(PropertyId::value).orElseThrow(() -> - new NoSuchElementException("No property id is listed for " + tenant)); - - Classification classification = properties.classificationFor(propertyId).orElseThrow(() -> - new NoSuchElementException("No property was found with id " + propertyId)); - - return classification.withLabel(deploymentFailureLabel); - } - - /** File @issue for @application, if @application doesn't already have an @Issue associated with it. */ - private void fileFor(Application application, Issue issue) { - Optional<String> ourIssueId = application.deploymentJobs().jiraIssueId() - .filter(jiraIssueId -> issues.fetch(jiraIssueId).status() != done); - - if ( ! ourIssueId.isPresent()) - controller().applications().setJiraIssueId(application.id(), Optional.of(issues.file(issue))); - } - - /** File @issue, or update a JIRA issue representing the same issue. */ - private void fileOrUpdate(Issue issue) { - Optional<String> jiraIssueId = issues.fetchSimilarTo(issue) - .stream().findFirst().map(Issues.IssueInfo::id); + controller().applications().setIssueId(application.id(), null); - if (jiraIssueId.isPresent()) - issues.update(jiraIssueId.get(), issue.description()); + // TODO: Change this logic, depending on the controller's definition of BROKEN, whether it updates applications + // TODO: to an older version when the system version is BROKEN, etc.. + if (failingApplications.size() > 0.2 * applications.size()) + deploymentIssues.fileUnlessOpen(failingApplications); else - issues.file(issue); + failingApplications.forEach(this::fileDeploymentIssueFor); } - /** Escalate JIRA issues for which there has been no activity for a set amount of time. */ - private void escalateInactiveDeploymentIssues(List<Application> applications) { - applications.forEach(application -> - application.deploymentJobs().jiraIssueId().ifPresent(jiraIssueId -> { - Issues.IssueInfo issueInfo = issues.fetch(jiraIssueId); - // TODO: Consider a different check here; no comments or reassignments, for instance. - if (issueInfo.updated().isBefore(controller().clock().instant().minus(maxInactivityAge))) - escalateAndComment(issueInfo, application); - })); + /** Returns whether deploymentJobs has a job which has been failing since before failureThreshold. */ + private boolean hasFailuresOlderThanThreshold(DeploymentJobs deploymentJobs) { + return deploymentJobs.hasFailures() + && deploymentJobs.failingSince().isBefore(controller().clock().instant().minus(maxFailureAge)); } - /** Reassign the JIRA issue for @application one step up in the OpsDb escalation chain, and add an explanatory comment to it. */ - private void escalateAndComment(IssueInfo issueInfo, Application application) { - Optional<String> assignee = issueInfo.assignee(); - if (assignee.isPresent()) { - if (assignee.get().equals(terminalUser.username())) return; - issues.addWatcher(issueInfo.id(), assignee.get()); - } - - Long propertyId = ownerOf(application).getPropertyId().get().value(); - - UserContact escalationTarget = contacts.escalationTargetFor(propertyId, assignee.orElse("no one")); - if (escalationTarget.is(assignee.orElse("no one"))) - escalationTarget = terminalUser; - - String comment = deploymentIssueEscalationComment(application, propertyId, assignee.orElse("anyone")); - - issues.comment(issueInfo.id(), comment); - issues.reassign(issueInfo.id(), escalationTarget.username()); - } - - Issue deploymentIssueFrom(Application application) { - return new Issue(deploymentIssueSummary(application), deploymentIssueDescription(application)) - .with(vespaOps); - } - - Issue manyFailingDeploymentsIssueFrom(Collection<Application> applications) { - return new Issue( - "More than 20% of Hosted Vespa deployments are failing", - applications.stream() - .map(application -> "[" + application.id().toShortString() + "|" + toUrl(application.id()) + "]") - .collect(Collectors.joining("\n")), - vespaOps); + private Tenant ownerOf(ApplicationId applicationId) { + return controller().tenants().tenant(new TenantId(applicationId.tenant().value())) + .orElseThrow(() -> new IllegalStateException("No tenant found for application " + applicationId)); } - // TODO: Use the method of the same name in ApplicationId - private static String toShortString(ApplicationId id) { - return id.tenant().value() + "." + id.application().value() + - ( id.instance().isDefault() ? "" : "." + id.instance().value() ); + private User userFor(Tenant tenant) { + return User.from(tenant.getId().id().replaceFirst("by-", "")); } - private String toUrl(ApplicationId applicationId) { - return controller().zoneRegistry().getDashboardUri().resolve("/apps" + - "/tenant/" + applicationId.tenant().value() + - "/application/" + applicationId.application().value()).toString(); + private PropertyId propertyIdFor(Tenant tenant) { + return tenant.getPropertyId() + .orElseThrow(() -> new NoSuchElementException("No PropertyId is listed for non-user tenant " + tenant)); } - private String toOpsDbUrl(long propertyId) { - return contacts.contactsUri(propertyId).toString(); - - } - - /** Returns the summary text what will be assigned to a new issue */ - private static String deploymentIssueSummary(Application application) { - return "[" + toShortString(application.id()) + "] Action required: Repair deployment"; - } - - /** Returns the description text what will be assigned to a new issue */ - private String deploymentIssueDescription(Application application) { - return "Deployment jobs of the Vespa application " + - "[" + toShortString(application.id()) + "|" + toUrl(application.id()) + "] have been failing " + - "continuously for over 48 hours. This blocks any change to this application from being deployed " + - "and will also block global rollout of new Vespa versions for everybody.\n\n" + - "Please assign your highest priority to fixing this. If you need support, request it using " + - "[yo/vespa-support|http://yo/vespa-support]. " + - "If this application is not in use, please re-assign this issue to project \"VESPA\" " + - "with component \"Services\", and ask for the application to be removed.\n\n" + - "If we do not get a response on this issue, we will auto-escalate it."; + /** File an issue for applicationId, if it doesn't already have an open issue associated with it. */ + private void fileDeploymentIssueFor(ApplicationId applicationId) { + try { + Tenant tenant = ownerOf(applicationId); + Optional<IssueId> ourIssueId = controller().applications().require(applicationId).deploymentJobs().issueId(); + IssueId issueId = tenant.tenantType() == TenantType.USER + ? deploymentIssues.fileUnlessOpen(ourIssueId, applicationId, userFor(tenant)) + : deploymentIssues.fileUnlessOpen(ourIssueId, applicationId, propertyIdFor(tenant)); + controller().applications().setIssueId(applicationId, issueId); + } + catch (RuntimeException e) { // Catch errors due to wrong data in the controller, or issues client timeout. + log.log(Level.WARNING, "Exception caught when attempting to file an issue for " + applicationId, e); + } } - /** Returns the comment text that what will be added to an issue each time it is escalated */ - private String deploymentIssueEscalationComment(Application application, long propertyId, String priorAssignee) { - return "This issue tracks the failing deployment of Vespa application " + - "[" + toShortString(application.id()) + "|" + toUrl(application.id()) + "]. " + - "Since we have not received a response from " + priorAssignee + - ", we are escalating to you, " + - "based on [your OpsDb information|" + toOpsDbUrl(propertyId) + "]. " + - "Please acknowledge this issue and assign somebody to " + - "fix it as soon as possible.\n\n" + - "If we do not receive a response we will keep auto-escalating this issue. " + - "If we run out of escalation options for your OpsDb property, we will assume this application " + - "is not managed by anyone and DELETE it. In the meantime, this issue will block global deployment " + - "of Vespa for the entire company."; + /** Escalate issues for which there has been no activity for a certain amount of time. */ + private void escalateInactiveDeploymentIssues(Collection<Application> applications) { + applications.forEach(application -> application.deploymentJobs().issueId().ifPresent(issueId -> { + try { + deploymentIssues.escalateIfInactive(issueId, ownerOf(application.id()).getPropertyId(), maxInactivity); + } + catch (RuntimeException e) { + log.log(Level.WARNING, "Exception caught when attempting to escalate issue with id " + issueId, e); + } + })); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/ApplicationSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/ApplicationSerializer.java index 81b3cb635ef..fd9a46339b6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/ApplicationSerializer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/ApplicationSerializer.java @@ -14,6 +14,7 @@ import com.yahoo.slime.Inspector; import com.yahoo.slime.Slime; import com.yahoo.vespa.config.SlimeUtils; import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; import com.yahoo.vespa.hosted.controller.application.ApplicationRevision; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Deployment; @@ -60,7 +61,7 @@ public class ApplicationSerializer { // DeploymentJobs fields private final String projectIdField = "projectId"; private final String jobStatusField = "jobStatus"; - private final String jiraIssueIdField = "jiraIssueId"; + private final String issueIdField = "jiraIssueId"; // JobStatus field private final String jobTypeField = "jobType"; @@ -125,7 +126,7 @@ public class ApplicationSerializer { .filter(id -> id > 0) // TODO: Discards invalid data. Remove filter after October 2017 .ifPresent(projectId -> cursor.setLong(projectIdField, projectId)); jobStatusToSlime(deploymentJobs.jobStatus().values(), cursor.setArray(jobStatusField)); - deploymentJobs.jiraIssueId().ifPresent(jiraIssueId -> cursor.setString(jiraIssueIdField, jiraIssueId)); + deploymentJobs.issueId().ifPresent(jiraIssueId -> cursor.setString(issueIdField, jiraIssueId.value())); } private void jobStatusToSlime(Collection<JobStatus> jobStatuses, Cursor jobStatusArray) { @@ -218,9 +219,9 @@ public class ApplicationSerializer { Optional<Long> projectId = optionalLong(object.field(projectIdField)) .filter(id -> id > 0); // TODO: Discards invalid data. Remove filter after October 2017 List<JobStatus> jobStatusList = jobStatusListFromSlime(object.field(jobStatusField)); - Optional<String> jiraIssueKey = optionalString(object.field(jiraIssueIdField)); + Optional<IssueId> issueId = optionalString(object.field(issueIdField)).map(IssueId::from); - return new DeploymentJobs(projectId, jobStatusList, jiraIssueKey); + return new DeploymentJobs(projectId, jobStatusList, issueId); } private Optional<Change> changeFromSlime(Inspector object) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTester.java index b5419bca0a5..1cf091fed2a 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTester.java @@ -27,7 +27,6 @@ import com.yahoo.vespa.hosted.controller.api.integration.chef.ChefMock; import com.yahoo.vespa.hosted.controller.api.integration.dns.MemoryNameService; import com.yahoo.vespa.hosted.controller.api.integration.entity.MemoryEntityService; import com.yahoo.vespa.hosted.controller.api.integration.github.GitHubMock; -import com.yahoo.vespa.hosted.controller.api.integration.jira.JiraMock; import com.yahoo.vespa.hosted.controller.api.integration.routing.MemoryGlobalRoutingService; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.cost.CostMock; @@ -206,7 +205,6 @@ public final class ControllerTester { curator, new MemoryRotationRepository(), gitHubClientMock, - new JiraMock(), new MemoryEntityService(), new MemoryGlobalRoutingService(), zoneRegistryMock, diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java index f8d09ac8b27..aa93fb1cfe2 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java @@ -1,13 +1,11 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Environment; import com.yahoo.vespa.hosted.controller.Application; -import com.yahoo.vespa.hosted.controller.api.integration.Contacts.UserContact; -import com.yahoo.vespa.hosted.controller.api.integration.Issues; -import com.yahoo.vespa.hosted.controller.api.integration.Issues.IssueInfo; -import com.yahoo.vespa.hosted.controller.api.integration.stubs.ContactsMock; -import com.yahoo.vespa.hosted.controller.api.integration.stubs.PropertiesMock; +import com.yahoo.vespa.hosted.controller.api.integration.organization.IssueId; +import com.yahoo.vespa.hosted.controller.api.integration.stubs.LoggingDeploymentIssues; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; @@ -15,28 +13,18 @@ import com.yahoo.vespa.hosted.controller.persistence.MockCuratorDb; import org.junit.Before; import org.junit.Test; -import java.time.Clock; import java.time.Duration; -import java.util.Arrays; import java.util.HashMap; -import java.util.List; import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import static com.yahoo.vespa.hosted.controller.api.integration.Contacts.Category.admin; -import static com.yahoo.vespa.hosted.controller.api.integration.Contacts.Category.engineeringOwner; -import static com.yahoo.vespa.hosted.controller.api.integration.Issues.IssueInfo.Status.done; -import static com.yahoo.vespa.hosted.controller.api.integration.Issues.IssueInfo.Status.toDo; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.component; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionCorpUsEast1; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.stagingTest; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest; import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxFailureAge; -import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxInactivityAge; -import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.terminalUser; -import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.vespaOps; +import static com.yahoo.vespa.hosted.controller.maintenance.DeploymentIssueReporter.maxInactivity; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; /** @@ -51,27 +39,18 @@ public class DeploymentIssueReporterTest { private DeploymentTester tester; private DeploymentIssueReporter reporter; - private ContactsMock contacts; - private PropertiesMock properties; - private MockIssues issues; + private MockDeploymentIssues issues; @Before public void setup() { tester = new DeploymentTester(); - contacts = new ContactsMock(); - properties = new PropertiesMock(); - issues = new MockIssues(tester.clock()); - reporter = new DeploymentIssueReporter(tester.controller(), contacts, properties, issues, Duration.ofMinutes(5), - new JobControl(new MockCuratorDb())); - } - - private List<IssueInfo> openIssuesFor(Application application) { - return issues.fetchSimilarTo(reporter.deploymentIssueFrom(tester.controller().applications().require(application.id()))); + issues = new MockDeploymentIssues(); + reporter = new DeploymentIssueReporter(tester.controller(), issues, Duration.ofMinutes(5), new JobControl(new MockCuratorDb())); } @Test public void testDeploymentFailureReporting() { - // All applications deploy from unique SD projects. + // All applications deploy from unique build projects. Long projectId1 = 10L; Long projectId2 = 20L; Long projectId3 = 30L; @@ -79,11 +58,12 @@ public class DeploymentIssueReporterTest { // Only the first two have propertyIds set now. Long propertyId1 = 1L; Long propertyId2 = 2L; + Long propertyId3 = 3L; // Create and deploy one application for each of three tenants. Application app1 = tester.createApplication("application1", "tenant1", projectId1, propertyId1); Application app2 = tester.createApplication("application2", "tenant2", projectId2, propertyId2); - Application app3 = tester.createApplication("application3", "tenant3", projectId3, null); + Application app3 = tester.createApplication("application3", "tenant3", projectId3, propertyId3); // And then we need lots of successful applications, so we won't assume we just have a faulty Vespa out. for (long i = 4; i <= 10; i++) { @@ -93,18 +73,6 @@ public class DeploymentIssueReporterTest { tester.deployAndNotify(app, applicationPackage, true, stagingTest); tester.deployAndNotify(app, applicationPackage, true, productionCorpUsEast1); } - - // Both the first tenants belong to the same JIRA queue. (Not sure if this is possible, but let's test it anyway. - String jiraQueue = "PROJECT"; - properties.addClassification(propertyId1, jiraQueue); - properties.addClassification(propertyId1, jiraQueue); - - // Only tenant1 has contacts listed in opsDb. - UserContact - alice = new UserContact("alice", "Alice", admin), - bob = new UserContact("bob", "Robert", engineeringOwner); - contacts.addContact(propertyId1, Arrays.asList(alice, bob)); - // end of setup. // NOTE: All maintenance should be idempotent within a small enough time interval, so maintain is called twice in succession throughout. @@ -125,7 +93,7 @@ public class DeploymentIssueReporterTest { reporter.maintain(); reporter.maintain(); - assertEquals("No deployments are detected as failing for a long time initially.", 0, issues.issues.size()); + assertEquals("No deployments are detected as failing for a long time initially.", 0, issues.size()); // Advance to where deployment issues should be detected. @@ -133,146 +101,103 @@ public class DeploymentIssueReporterTest { reporter.maintain(); reporter.maintain(); - assertEquals("One issue is produced for app1.", 1, openIssuesFor(app1).size()); - assertEquals("No issues are produced for app2.", 0, openIssuesFor(app2).size()); - assertEquals("One issue is produced for app3.", 1, openIssuesFor(app3).size()); - assertTrue("The issue for app1 is stored in their JIRA queue.", openIssuesFor(app1).get(0).key().startsWith(jiraQueue)); - assertTrue("The issue for an application without propertyId is addressed to vespaOps.", openIssuesFor(app3).get(0).key().startsWith(vespaOps.queue())); + assertTrue("One issue is produced for app1.", issues.isOpenFor(app1.id())); + assertFalse("No issues are produced for app2.", issues.isOpenFor(app2.id())); + assertTrue("One issue is produced for app3.", issues.isOpenFor(app3.id())); - // Verify idempotency of filing. - reporter.maintain(); - reporter.maintain(); - assertEquals("No issues are re-filed when still open.", 2, issues.issues.size()); - - - // tenant3 closes their issue prematurely; see that we get a new filing. - issues.complete(openIssuesFor(app3).get(0).id()); - assertEquals("The issue is removed (test of the tester, really...).", 0, openIssuesFor(app3).size()); + // app3 closes their issue prematurely; see that it is refiled. + issues.closeFor(app3.id()); + assertFalse("No issue is open for app3.", issues.isOpenFor(app3.id())); reporter.maintain(); reporter.maintain(); - assertTrue("Issue is re-produced for app3, addressed correctly.", openIssuesFor(app3).get(0).key().startsWith(vespaOps.queue())); + assertTrue("Issue is re-filed for app3.", issues.isOpenFor(app3.id())); // Some time passes; tenant1 leaves her issue unattended, while tenant3 starts work and updates the issue. // app2 also has an intermittent failure; see that we detect this as a Vespa problem, and file an issue to ourselves. tester.deployAndNotify(app2, applicationPackage, false, productionCorpUsEast1); - tester.clock().advance(maxInactivityAge.plus(maxFailureAge)); - issues.comment(openIssuesFor(app3).get(0).id(), "We are trying to fix it!"); - - reporter.maintain(); - reporter.maintain(); - assertEquals("The issue for app1 is escalated once.", alice.username(), openIssuesFor(app1).get(0).assignee().get()); - + tester.clock().advance(maxInactivity.plus(maxFailureAge)); + issues.touchFor(app3.id()); + assertFalse("We have no platform issues initially.", issues.platformIssue()); reporter.maintain(); reporter.maintain(); - assertEquals("We get an issue to vespaOps when more than 20% of applications have old failures.", 1, - issues.fetchSimilarTo(reporter.manyFailingDeploymentsIssueFrom(Arrays.asList( - tester.controller().applications().get(app1.id()).get(), - tester.controller().applications().get(app2.id()).get(), - tester.controller().applications().get(app3.id()).get()))).size()); - assertEquals("No issue is filed for app2 while Vespa is considered broken.", 0, openIssuesFor(app2).size()); + assertEquals("The issue for app1 is escalated once.", 1, issues.escalationLevelFor(app1.id())); + assertTrue("We get a platform issue when more than 20% of applications are failing.", issues.platformIssue()); + assertFalse("No issue is filed for app2 while Vespa is considered broken.", issues.isOpenFor(app2.id())); // app3 fixes its problem, but the ticket is left open; see the resolved ticket is not escalated when another escalation period has passed. tester.deployAndNotify(app2, applicationPackage, true, productionCorpUsEast1); tester.deployAndNotify(app3, applicationPackage, true, productionCorpUsEast1); - tester.clock().advance(maxInactivityAge.plus(Duration.ofDays(1))); + tester.clock().advance(maxInactivity.plus(Duration.ofDays(1))); reporter.maintain(); reporter.maintain(); - assertEquals("The issue for app1 is escalated once more.", bob.username(), openIssuesFor(app1).get(0).assignee().get()); - assertEquals("The issue for app3 is still unassigned.", Optional.empty(), openIssuesFor(app3).get(0).assignee()); + assertFalse("We no longer have a platform issue.", issues.platformIssue()); + assertEquals("The issue for app1 is escalated once more.", 2, issues.escalationLevelFor(app1.id())); + assertEquals("The issue for app3 is not escalated.", 0, issues.escalationLevelFor(app3.id())); - // app1 still does nothing with their issue; see the terminal user gets it in the end. // app3 now has a new failure past max failure age; see that a new issue is filed. tester.notifyJobCompletion(component, app3, true); tester.deployAndNotify(app3, applicationPackage, true, systemTest); tester.deployAndNotify(app3, applicationPackage, true, stagingTest); tester.deployAndNotify(app3, applicationPackage, false, productionCorpUsEast1); - tester.clock().advance(maxInactivityAge.plus(maxFailureAge)); + tester.clock().advance(maxInactivity.plus(maxFailureAge)); reporter.maintain(); reporter.maintain(); - assertEquals("The issue for app1 is escalated to the terminal user.", terminalUser.username(), openIssuesFor(app1).get(0).assignee().get()); - assertEquals("A new issue is filed for app3.", 2, openIssuesFor(app3).size()); + assertTrue("A new issue is filed for app3.", issues.isOpenFor(app3.id())); } - class MockIssues implements Issues { - final Map<String, Issue> issues = new HashMap<>(); - final Map<String, IssueInfo> metas = new HashMap<>(); - final Map<String, Long> counters = new HashMap<>(); - Clock clock; + class MockDeploymentIssues extends LoggingDeploymentIssues { - MockIssues(Clock clock) { this.clock = clock; } + Map<ApplicationId, IssueId> applicationIssues = new HashMap<>(); + Map<IssueId, Integer> issueLevels = new HashMap<>(); - public void addWatcher(String jiraIssueId, String watcher) { - touch(jiraIssueId); + MockDeploymentIssues() { + super(tester.clock()); } - public void reassign(String jiraIssueId, String assignee) { - metas.compute(jiraIssueId, (__, jiraIssueMeta) -> - new IssueInfo( - jiraIssueId, - jiraIssueMeta.key(), - clock.instant(), - Optional.of(assignee), - jiraIssueMeta.status())); + @Override + protected void escalateIssue(IssueId issueId) { + super.escalateIssue(issueId); + issueLevels.merge(issueId, 1, Integer::sum); } - public void comment(String jiraIssueId, String comment) { - touch(jiraIssueId); + @Override + protected IssueId fileIssue(ApplicationId applicationId) { + IssueId issueId = super.fileIssue(applicationId); + applicationIssues.put(applicationId, issueId); + return issueId; } - public void update(String jiraIssueId, String description) { - issues.compute(jiraIssueId, (__, issue) -> - new Issue(issue.summary(), description, issue.classification().orElse(null))); + void closeFor(ApplicationId applicationId) { + issueUpdates.remove(applicationIssues.remove(applicationId)); } - public String file(Issue issue) { - String jiraIssueId = (issues.size() + 1L) + ""; - Long counter = counters.merge(issue.classification().get().queue(), 0L, (old, __) -> old + 1); - String jiraIssueKey = issue.classification().get().queue() + '-' + counter; - issues.put(jiraIssueId, issue); - metas.put(jiraIssueId, new IssueInfo(jiraIssueId, jiraIssueKey, clock.instant(), null, toDo)); - return jiraIssueId; + void touchFor(ApplicationId applicationId) { + issueUpdates.put(applicationIssues.get(applicationId), tester.clock().instant()); } - public IssueInfo fetch(String jiraIssueId) { - return metas.get(jiraIssueId); + boolean isOpenFor(ApplicationId applicationId) { + return applicationIssues.containsKey(applicationId); } - public List<IssueInfo> fetchSimilarTo(Issue issue) { - return issues.entrySet().stream() - .filter(entry -> entry.getValue().summary().equals(issue.summary())) - .map(Map.Entry::getKey) - .map(metas::get) - .filter(meta -> meta.status() != done) - .collect(Collectors.toList()); + int escalationLevelFor(ApplicationId applicationId) { + return issueLevels.getOrDefault(applicationIssues.get(applicationId), 0); } - private void complete(String jiraIssueId) { - metas.compute(jiraIssueId, (__, jiraIssueMeta) -> - new IssueInfo( - jiraIssueId, - jiraIssueMeta.key(), - clock.instant(), - jiraIssueMeta.assignee(), - done)); + int size() { + return issueUpdates.size(); } - private void touch(String jiraIssueId) { - metas.compute(jiraIssueId, (__, jiraIssueMeta) -> - new IssueInfo( - jiraIssueId, - jiraIssueMeta.key(), - clock.instant(), - jiraIssueMeta.assignee(), - jiraIssueMeta.status())); + boolean platformIssue() { + return platformIssue.get(); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ControllerContainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ControllerContainerTest.java index 8b2595c6254..fbd581f3ba1 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ControllerContainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/ControllerContainerTest.java @@ -41,11 +41,8 @@ public class ControllerContainerTest { " <component id='com.yahoo.vespa.hosted.controller.api.integration.dns.MemoryNameService'/>" + " <component id='com.yahoo.vespa.hosted.controller.api.integration.entity.MemoryEntityService'/>" + " <component id='com.yahoo.vespa.hosted.controller.api.integration.github.GitHubMock'/>" + - " <component id='com.yahoo.vespa.hosted.controller.api.integration.jira.JiraMock'/>" + " <component id='com.yahoo.vespa.hosted.controller.api.integration.routing.MemoryGlobalRoutingService'/>" + - " <component id='com.yahoo.vespa.hosted.controller.api.integration.stubs.ContactsMock'/>" + - " <component id='com.yahoo.vespa.hosted.controller.api.integration.stubs.LoggingIssues'/>" + - " <component id='com.yahoo.vespa.hosted.controller.api.integration.stubs.PropertiesMock'/>" + + " <component id='com.yahoo.vespa.hosted.controller.api.integration.stubs.LoggingDeploymentIssues'/>" + " <component id='com.yahoo.vespa.hosted.controller.ConfigServerClientMock'/>" + " <component id='com.yahoo.vespa.hosted.controller.ZoneRegistryMock'/>" + " <component id='com.yahoo.vespa.hosted.controller.Controller'/>" + |