diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-07-16 09:56:14 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-07-16 11:12:10 +0200 |
commit | f3efc9b88eba737b5036a60a381ced8960a26560 (patch) | |
tree | 03885dbf38ad5bbe636dd87c3528727779ff8de1 | |
parent | c608c8384315cebdc8adacb012a8c49a09cc0340 (diff) |
Emit QoS metric for all maintainers
67 files changed, 485 insertions, 235 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java b/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java index bc9fa96f943..937cf4dfe7f 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java @@ -206,6 +206,14 @@ public class ApplicationRepository implements com.yahoo.config.provision.Deploye this.metric = metric; } + public Clock clock() { + return clock; + } + + public Metric metric() { + return metric; + } + // ---------------- Deploying ---------------------------------------------------------------- public PrepareResult prepare(Tenant tenant, long sessionId, PrepareParams prepareParams, Instant now) { diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java index ccbad4e21c7..92044eab5fe 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java @@ -50,8 +50,9 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer { } @Override - protected void maintain() { - if (! distributeApplicationPackage.value()) return; + protected boolean maintain() { + boolean success = true; + if (! distributeApplicationPackage.value()) return success; try (var fileDownloader = new FileDownloader(createConnectionPool(configserverConfig), downloadDirectory)) { for (var applicationId : applicationRepository.listApplications()) { @@ -68,6 +69,7 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer { log.fine(() -> "Downloading missing application package for application " + applicationId + " - session " + sessionId); if (fileDownloader.getFile(applicationPackage).isEmpty()) { + success = false; log.warning("Failed to download application package for application " + applicationId + " - session " + sessionId); continue; } @@ -76,6 +78,7 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer { } } } + return success; } private void createLocalSessionIfMissing(ApplicationId applicationId, long sessionId) { diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java index 5369bbef366..007ca8dcf53 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java @@ -3,7 +3,9 @@ package com.yahoo.vespa.config.server.maintenance; import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.concurrent.maintenance.JobControlState; +import com.yahoo.concurrent.maintenance.JobMetrics; import com.yahoo.concurrent.maintenance.Maintainer; +import com.yahoo.jdisc.Metric; import com.yahoo.path.Path; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.config.server.ApplicationRepository; @@ -12,7 +14,9 @@ import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.ListFlag; +import java.time.Clock; import java.time.Duration; +import java.util.Map; import java.util.Set; /** @@ -26,16 +30,25 @@ public abstract class ConfigServerMaintainer extends Maintainer { ConfigServerMaintainer(ApplicationRepository applicationRepository, Curator curator, FlagSource flagSource, Duration initialDelay, Duration interval) { - super(null, interval, initialDelay, new JobControl(new JobControlFlags(curator, flagSource))); + super(null, interval, initialDelay, new JobControl(new JobControlFlags(curator, flagSource)), + jobMetrics(applicationRepository.clock(), applicationRepository.metric())); this.applicationRepository = applicationRepository; } + private static JobMetrics jobMetrics(Clock clock, Metric metric) { + return new JobMetrics(clock, (job, instant) -> { + Duration sinceSuccess = Duration.between(instant, clock.instant()); + metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job))); + }); + } + private static class JobControlFlags implements JobControlState { private static final Path root = Path.fromString("/configserver/v1/"); - private static final Path lockRoot = root.append("locks"); + private static final Path lockRoot = root.append("locks"); private final Curator curator; + private final ListFlag<String> inactiveJobsFlag; public JobControlFlags(Curator curator, FlagSource flagSource) { diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java index a6585be391c..adcaa3bb0e4 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java @@ -5,6 +5,7 @@ import com.google.inject.Inject; import com.yahoo.cloud.config.ConfigserverConfig; import com.yahoo.component.AbstractComponent; import com.yahoo.config.provision.SystemName; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.config.server.ApplicationRepository; import com.yahoo.vespa.config.server.filedistribution.FileDistributionFactory; import com.yahoo.vespa.curator.Curator; @@ -31,7 +32,8 @@ public class ConfigServerMaintenance extends AbstractComponent { ApplicationRepository applicationRepository, Curator curator, FileDistributionFactory fileDistributionFactory, - FlagSource flagSource) { + FlagSource flagSource, + Metric metric) { DefaultTimes defaults = new DefaultTimes(configserverConfig); // TODO: Disabled until we have application metadata //tenantsMaintainer = new TenantsMaintainer(applicationRepository, curator, defaults.tenantsMaintainerInterval); diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java index ed57be799c7..835122c043c 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java @@ -35,8 +35,8 @@ public class FileDistributionMaintainer extends ConfigServerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { applicationRepository.deleteUnusedFiledistributionReferences(fileReferencesDir, maxUnusedFileReferenceAge); - + return true; } } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java index 4adf287448d..77da56588ba 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java @@ -26,7 +26,7 @@ public class SessionsMaintainer extends ConfigServerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { applicationRepository.deleteExpiredLocalSessions(); // Expired remote sessions are sessions that belong to an application that have external deployments that @@ -41,5 +41,7 @@ public class SessionsMaintainer extends ConfigServerMaintainer { int deleted = applicationRepository.deleteExpiredLocks(lockExpiryTime); if (deleted > 0) log.log(LogLevel.INFO, "Deleted " + deleted + " locks older than " + lockExpiryTime); + + return true; } } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java index 9a81d9f7547..d29eea842f5 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java @@ -28,8 +28,9 @@ public class TenantsMaintainer extends ConfigServerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { applicationRepository.deleteUnusedTenants(ttlForUnusedTenant, clock.instant()); + return true; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java index 786819d9442..1f20e48edf5 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java @@ -17,6 +17,7 @@ import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.util.HashMap; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; /** @@ -38,14 +39,15 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { } @Override - protected void maintain() { - confirmApplicationOwnerships(); - ensureConfirmationResponses(); - updateConfirmedApplicationOwners(); + protected boolean maintain() { + return confirmApplicationOwnerships() & + ensureConfirmationResponses() & + updateConfirmedApplicationOwners(); } /** File an ownership issue with the owners of all applications we know about. */ - private void confirmApplicationOwnerships() { + private boolean confirmApplicationOwnerships() { + AtomicBoolean success = new AtomicBoolean(true); applications() .withProjectId() .withProductionDeployment() @@ -63,10 +65,11 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { }).ifPresent(newIssueId -> store(newIssueId, application.id())); } catch (RuntimeException e) { // Catch errors due to wrong data in the controller, or issues client timeout. + success.set(false); log.log(Level.INFO, "Exception caught when attempting to file an issue for '" + application.id() + "': " + Exceptions.toMessageString(e)); } }); - + return success.get(); } private ApplicationSummary summaryOf(TenantAndApplicationId application) { @@ -85,7 +88,8 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { } /** Escalate ownership issues which have not been closed before a defined amount of time has passed. */ - private void ensureConfirmationResponses() { + private boolean ensureConfirmationResponses() { + AtomicBoolean success = new AtomicBoolean(true); for (Application application : applications()) application.ownershipIssueId().ifPresent(issueId -> { try { @@ -93,12 +97,14 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { ownershipIssues.ensureResponse(issueId, tenant.contact()); } catch (RuntimeException e) { + success.set(false); log.log(Level.INFO, "Exception caught when attempting to escalate issue with id '" + issueId + "': " + Exceptions.toMessageString(e)); } }); + return success.get(); } - private void updateConfirmedApplicationOwners() { + private boolean updateConfirmedApplicationOwners() { applications() .withProjectId() .withProductionDeployment() @@ -112,6 +118,7 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { controller().applications().store(lockedApplication.withOwner(owner))); }); }); + return true; } private ApplicationList applications() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java index 4b96bd404ee..10e5431dac1 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java @@ -24,6 +24,7 @@ import java.util.stream.Collectors; * Automatically fetches and handles scheduled events from AWS: * 1. Deprovisions the affected hosts if applicable * 2. Submits an issue detailing the event if some hosts are not processed by 1. + * * @author mgimle */ public class CloudEventReporter extends ControllerMaintainer { @@ -44,8 +45,7 @@ public class CloudEventReporter extends ControllerMaintainer { } @Override - protected void maintain() { - log.log(Level.INFO, "Fetching events for cloud hosts."); + protected boolean maintain() { for (var awsRegion : zonesByCloudNativeRegion.keySet()) { List<CloudEvent> events = eventFetcher.getEvents(awsRegion); for (var event : events) { @@ -56,6 +56,7 @@ public class CloudEventReporter extends ControllerMaintainer { submitIssue(event, deprovisionedHosts); } } + return true; } private List<String> deprovisionHosts(String awsRegion, CloudEvent event) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java index 4aba8d881bf..e19f3b4f9a2 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java @@ -35,8 +35,9 @@ public class ContactInformationMaintainer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { TenantController tenants = controller().tenants(); + boolean success = true; for (Tenant tenant : tenants.asList()) { log.log(INFO, "Updating contact information for " + tenant); try { @@ -55,11 +56,13 @@ public class ContactInformationMaintainer extends ControllerMaintainer { throw new IllegalArgumentException("Unexpected tenant type '" + tenant.type() + "'."); } } catch (Exception e) { + success = false; log.log(Level.WARNING, "Failed to update contact information for " + tenant + ": " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } } + return success; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java index 2b7c78f96d0..76003a873fe 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java @@ -1,12 +1,16 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.concurrent.maintenance.JobMetrics; import com.yahoo.concurrent.maintenance.Maintainer; import com.yahoo.config.provision.SystemName; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.controller.Controller; +import java.time.Clock; import java.time.Duration; import java.util.EnumSet; +import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.logging.Logger; @@ -30,7 +34,8 @@ public abstract class ControllerMaintainer extends Maintainer { } public ControllerMaintainer(Controller controller, Duration interval, String name, Set<SystemName> activeSystems) { - super(name, interval, controller.clock().instant(), controller.jobControl(), controller.curator().cluster()); + super(name, interval, controller.clock().instant(), controller.jobControl(), + jobMetrics(controller.clock(), controller.metric()), controller.curator().cluster()); this.controller = controller; this.activeSystems = Set.copyOf(Objects.requireNonNull(activeSystems)); } @@ -43,4 +48,11 @@ public abstract class ControllerMaintainer extends Maintainer { super.run(); } + private static JobMetrics jobMetrics(Clock clock, Metric metric) { + return new JobMetrics(clock, (job, instant) -> { + Duration sinceSuccess = Duration.between(instant, clock.instant()); + metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job))); + }); + } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java index d028a88fb92..28b64b5bfe0 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java @@ -31,9 +31,10 @@ public class CostReportMaintainer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { var csv = CostCalculator.resourceShareByPropertyToCsv(nodeRepository, controller(), clock, consumer.fixedAllocations()); consumer.consume(csv); + return true; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java index bb2161bca1d..7bd2c737fcb 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java @@ -1,7 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; -import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; @@ -24,7 +23,8 @@ public class DeploymentExpirer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { + boolean success = true; for (Application application : controller().applications().readable()) for (Instance instance : application.instances().values()) for (Deployment deployment : instance.deployments().values()) { @@ -34,11 +34,13 @@ public class DeploymentExpirer extends ControllerMaintainer { log.log(Level.INFO, "Expiring deployment of " + instance.id() + " in " + deployment.zone()); controller().applications().deactivate(instance.id(), deployment.zone()); } catch (Exception e) { + success = false; log.log(Level.WARNING, "Could not expire " + deployment + " of " + instance + ": " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } } + return success; } /** Returns whether given deployment has expired according to its TTL */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java index 89f1e0fe840..a94e7407898 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java @@ -2,7 +2,6 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; -import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.SystemName; import com.yahoo.vespa.hosted.controller.Application; @@ -20,6 +19,7 @@ import java.time.Duration; import java.util.Collection; import java.util.List; import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import static com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence.broken; @@ -44,10 +44,10 @@ public class DeploymentIssueReporter extends ControllerMaintainer { } @Override - protected void maintain() { - maintainDeploymentIssues(applications()); - maintainPlatformIssue(applications()); - escalateInactiveDeploymentIssues(applications()); + protected boolean maintain() { + return maintainDeploymentIssues(applications()) & + maintainPlatformIssue(applications()) & + escalateInactiveDeploymentIssues(applications()); } /** Returns the applications to maintain issue status for. */ @@ -62,7 +62,7 @@ public class DeploymentIssueReporter extends ControllerMaintainer { * and store the issue id for the filed issues. Also, clear the issueIds of applications * where deployment has not failed for this amount of time. */ - private void maintainDeploymentIssues(List<Application> applications) { + private boolean maintainDeploymentIssues(List<Application> applications) { List<TenantAndApplicationId> failingApplications = controller().jobController().deploymentStatuses(ApplicationList.from(applications)) .failingApplicationChangeSince(controller().clock().instant().minus(maxFailureAge)) .mapToList(status -> status.application().id()); @@ -72,6 +72,7 @@ public class DeploymentIssueReporter extends ControllerMaintainer { fileDeploymentIssueFor(application); else store(application.id(), null); + return true; } /** @@ -79,24 +80,26 @@ public class DeploymentIssueReporter extends ControllerMaintainer { * applications that have been failing the upgrade to the system version for * longer than the set grace period, or update this list if the issue already exists. */ - private void maintainPlatformIssue(List<Application> applications) { + private boolean maintainPlatformIssue(List<Application> applications) { + boolean success = true; if (controller().system() == SystemName.cd) - return; + return success; Version systemVersion = controller().systemVersion(); if ((controller().versionStatus().version(systemVersion).confidence() != broken)) - return; + return success; DeploymentStatusList statuses = controller().jobController().deploymentStatuses(ApplicationList.from(applications)); if (statuses.failingUpgradeToVersionSince(systemVersion, controller().clock().instant().minus(upgradeGracePeriod)).isEmpty()) - return; + return success; List<ApplicationId> failingApplications = statuses.failingUpgradeToVersionSince(systemVersion, controller().clock().instant()) .mapToList(status -> status.application().id().defaultInstance()); // TODO jonmv: Send only tenant and application, here and elsewhere in this. deploymentIssues.fileUnlessOpen(failingApplications, systemVersion); + return success; } private Tenant ownerOf(TenantAndApplicationId applicationId) { @@ -121,7 +124,8 @@ public class DeploymentIssueReporter extends ControllerMaintainer { } /** Escalate issues for which there has been no activity for a certain amount of time. */ - private void escalateInactiveDeploymentIssues(Collection<Application> applications) { + private boolean escalateInactiveDeploymentIssues(Collection<Application> applications) { + AtomicBoolean success = new AtomicBoolean(true); applications.forEach(application -> application.deploymentIssueId().ifPresent(issueId -> { try { Tenant tenant = ownerOf(application.id()); @@ -130,9 +134,11 @@ public class DeploymentIssueReporter extends ControllerMaintainer { tenant.type() == Tenant.Type.athenz ? tenant.contact() : Optional.empty()); } catch (RuntimeException e) { + success.set(false); log.log(Level.INFO, "Exception caught when attempting to escalate issue with id '" + issueId + "': " + Exceptions.toMessageString(e)); } })); + return success.get(); } private void store(TenantAndApplicationId id, IssueId issueId) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java index c03be2ca1d1..c8416578932 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java @@ -1,7 +1,6 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; -import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.config.provision.SystemName; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; @@ -39,7 +38,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { AtomicInteger failures = new AtomicInteger(0); AtomicInteger attempts = new AtomicInteger(0); AtomicReference<Exception> lastException = new AtomicReference<>(null); @@ -91,6 +90,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { } catch (InterruptedException e) { throw new RuntimeException(e); } + return lastException.get() == null; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java index 7006458538d..7952355d5fb 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java @@ -35,8 +35,9 @@ public abstract class InfrastructureUpgrader<VERSION> extends ControllerMaintain } @Override - protected void maintain() { + protected boolean maintain() { targetVersion().ifPresent(target -> upgradeAll(target, SystemApplication.all())); + return true; } /** Deploy a list of system applications until they converge on the given version */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index cfe9257bdf8..e0f2f0718ef 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -48,9 +48,10 @@ public class JobRunner extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { jobs.active().forEach(this::advance); jobs.collectGarbage(); + return true; } @Override diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index cc4a8c628eb..0c5ef123eef 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -68,12 +68,13 @@ public class MetricsReporter extends ControllerMaintainer { } @Override - public void maintain() { + public boolean maintain() { reportDeploymentMetrics(); reportRemainingRotations(); reportQueuedNameServiceRequests(); reportInfrastructureUpgradeMetrics(); reportAuditLog(); + return true; } private void reportAuditLog() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java index 9febc73a5a7..e223809a211 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java @@ -38,12 +38,13 @@ public class NameServiceDispatcher extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { + boolean success = true; try (var lock = db.lockNameServiceQueue()) { var queue = db.readNameServiceQueue(); var instant = clock.instant(); var remaining = queue.dispatchTo(nameService, requestCount); - if (queue == remaining) return; // Queue unchanged + if (queue == remaining) return success; // Queue unchanged var dispatched = queue.first(requestCount); if (!dispatched.requests().isEmpty()) { @@ -53,6 +54,7 @@ public class NameServiceDispatcher extends ControllerMaintainer { } db.writeNameServiceQueue(remaining); } + return success; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java index a62b1745145..20febfaea1d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java @@ -1,7 +1,6 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; -import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.versions.OsVersionStatus; import com.yahoo.yolean.Exceptions; @@ -19,14 +18,16 @@ public class OsVersionStatusUpdater extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { try { OsVersionStatus newStatus = OsVersionStatus.compute(controller()); controller().updateOsVersionStatus(newStatus); + return true; } catch (Exception e) { log.log(Level.WARNING, "Failed to compute version status: " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } + return false; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java index 5dd62251759..a032f266de5 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java @@ -19,12 +19,13 @@ public class OutstandingChangeDeployer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { for (Application application : ApplicationList.from(controller().applications().readable()) .withProductionDeployment() .withDeploymentSpec() .asList()) controller().applications().deploymentTrigger().triggerNewRevision(application.id()); + return true; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java index 32b65f05cac..a626f21359a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java @@ -1,7 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; -import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.vespa.hosted.controller.Controller; import java.time.Duration; @@ -18,8 +17,9 @@ public class ReadyJobsTrigger extends ControllerMaintainer { } @Override - public void maintain() { + public boolean maintain() { controller().applications().deploymentTrigger().triggerReadyJobs(); + return true; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java index 76a186a2f6b..f460561df08 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java @@ -50,13 +50,15 @@ public class ResourceMeterMaintainer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { try { collectResourceSnapshots(); + return true; } catch (Exception e) { log.log(Level.WARNING, "Failed to collect resource snapshots. Retrying in " + interval() + ". Error: " + Exceptions.toMessageString(e)); } + return false; } private void collectResourceSnapshots() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java index 31434de472d..863302223ac 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java @@ -1,7 +1,6 @@ // Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; -import com.yahoo.concurrent.maintenance.JobControl; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.CloudName; import com.yahoo.config.provision.HostName; @@ -27,7 +26,7 @@ public class ResourceTagMaintainer extends ControllerMaintainer { } @Override - public void maintain() { + public boolean maintain() { controller().zoneRegistry().zones() .ofCloud(CloudName.from("aws")) .reachable() @@ -37,8 +36,7 @@ public class ResourceTagMaintainer extends ControllerMaintainer { if (taggedResources > 0) log.log(Level.INFO, "Tagged " + taggedResources + " resources in " + zone.getId()); }); - - + return true; } private Map<HostName, ApplicationId> getTenantOfParentHosts(ZoneId zoneId) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/RotationStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/RotationStatusUpdater.java index 245747a882f..935bcbec597 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/RotationStatusUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/RotationStatusUpdater.java @@ -41,7 +41,7 @@ public class RotationStatusUpdater extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { var failures = new AtomicInteger(0); var attempts = new AtomicInteger(0); var lastException = new AtomicReference<Exception>(null); @@ -78,6 +78,7 @@ public class RotationStatusUpdater extends ControllerMaintainer { } catch (InterruptedException e) { throw new RuntimeException(e); } + return lastException.get() == null; } private RotationStatus getStatus(Instance instance) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java index 0fe6f7e0bfb..3b0a1fca4af 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java @@ -21,13 +21,14 @@ public class SystemRoutingPolicyMaintainer extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { for (var zone : controller().zoneRegistry().zones().all().ids()) { for (var application : SystemApplication.values()) { if (!application.hasEndpoint()) continue; controller().routing().policies().refresh(application.id(), DeploymentSpec.empty, zone); } } + return true; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 5f0f2e4ba4e..9ab2b0e77e8 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -51,7 +51,7 @@ public class Upgrader extends ControllerMaintainer { * Schedule application upgrades. Note that this implementation must be idempotent. */ @Override - public void maintain() { + public boolean maintain() { // Determine target versions for each upgrade policy Version canaryTarget = controller().systemVersion(); Collection<Version> defaultTargets = targetVersions(Confidence.normal); @@ -89,6 +89,7 @@ public class Upgrader extends ControllerMaintainer { upgrade(instances.with(UpgradePolicy.canary), canaryTarget, instances.size()); defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, numberOfApplicationsToUpgrade())); conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, numberOfApplicationsToUpgrade())); + return true; } /** Returns the target versions for given confidence, one per major version in the system */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java index d8b74a4ae99..a3e9672b715 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java @@ -29,7 +29,7 @@ public class VersionStatusUpdater extends ControllerMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { try { VersionStatus newStatus = VersionStatus.compute(controller()); controller().updateVersionStatus(newStatus); @@ -37,10 +37,12 @@ public class VersionStatusUpdater extends ControllerMaintainer { controller().serviceRegistry().systemMonitor().reportSystemVersion(version.versionNumber(), convert(version.confidence())); }); + return true; } catch (Exception e) { log.log(Level.WARNING, "Failed to compute version status: " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } + return false; } static SystemMonitor.Confidence convert(VespaVersion.Confidence confidence) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java index 1151fdd07f0..4218e66703f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.config.provision.SystemName; import com.yahoo.vespa.hosted.controller.ControllerTester; +import com.yahoo.vespa.hosted.controller.integration.MetricsMock; import org.junit.Before; import org.junit.Test; @@ -32,12 +33,21 @@ public class ControllerMaintainerTest { assertEquals(1, executions.get()); } + @Test + public void records_metric() { + maintainerIn(SystemName.main, new AtomicInteger()).run(); + MetricsMock metrics = (MetricsMock) tester.controller().metric(); + assertEquals(0L, metrics.getMetric((context) -> "MockMaintainer".equals(context.get("job")), + "maintenance.secondsSinceSuccess").get()); + } + private ControllerMaintainer maintainerIn(SystemName system, AtomicInteger executions) { return new ControllerMaintainer(tester.controller(), Duration.ofDays(1), "MockMaintainer", EnumSet.of(system)) { @Override - protected void maintain() { + protected boolean maintain() { executions.incrementAndGet(); + return true; } }; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index a762f718ab7..9980335bab0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -35,14 +35,15 @@ public abstract class ApplicationMaintainer extends NodeRepositoryMaintainer { new DaemonThreadFactory("node repo application maintainer")); protected ApplicationMaintainer(Deployer deployer, Metric metric, NodeRepository nodeRepository, Duration interval) { - super(nodeRepository, interval); + super(nodeRepository, interval, metric); this.deployer = deployer; this.metric = metric; } @Override - protected final void maintain() { + protected final boolean maintain() { applicationsNeedingMaintenance().forEach(this::deploy); + return true; } /** Returns the number of deployments that are pending execution */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index c32b7854d4e..e2b98d8d000 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -14,7 +14,6 @@ import com.yahoo.vespa.hosted.provision.applications.Applications; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler; import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricsDb; -import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; import java.time.Duration; import java.util.List; @@ -38,17 +37,19 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { Deployer deployer, Metric metric, Duration interval) { - super(nodeRepository, interval); + super(nodeRepository, interval, metric); this.autoscaler = new Autoscaler(metricsDb, nodeRepository); this.metric = metric; this.deployer = deployer; } @Override - protected void maintain() { - if ( ! nodeRepository().zone().environment().isProduction()) return; + protected boolean maintain() { + boolean success = true; + if ( ! nodeRepository().zone().environment().isProduction()) return success; activeNodesByApplication().forEach((applicationId, nodes) -> autoscale(applicationId, nodes)); + return success; } private void autoscale(ApplicationId application, List<Node> applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java index f428e276df8..eb5973f11a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; @@ -25,8 +26,8 @@ public class DirtyExpirer extends Expirer { private final NodeRepository nodeRepository; - DirtyExpirer(NodeRepository nodeRepository, Clock clock, Duration dirtyTimeout) { - super(Node.State.dirty, History.Event.Type.deallocated, nodeRepository, clock, dirtyTimeout); + DirtyExpirer(NodeRepository nodeRepository, Clock clock, Duration dirtyTimeout, Metric metric) { + super(Node.State.dirty, History.Event.Type.deallocated, nodeRepository, clock, dirtyTimeout, metric); this.nodeRepository = nodeRepository; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 0a32970e056..b9005a028ff 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.OutOfCapacityException; +import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; @@ -49,19 +50,21 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { DynamicProvisioningMaintainer(NodeRepository nodeRepository, Duration interval, HostProvisioner hostProvisioner, - FlagSource flagSource) { - super(nodeRepository, interval); + FlagSource flagSource, + Metric metric) { + super(nodeRepository, interval, metric); this.hostProvisioner = hostProvisioner; this.targetCapacityFlag = Flags.TARGET_CAPACITY.bindTo(flagSource); } @Override - protected void maintain() { + protected boolean maintain() { try (Mutex lock = nodeRepository().lockUnallocated()) { NodeList nodes = nodeRepository().list(); resumeProvisioning(nodes, lock); convergeToCapacity(nodes); } + return true; } /** Resume provisioning of already provisioned hosts and their children */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java index dc5155312e7..43f5210b233 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.History; @@ -32,8 +33,8 @@ public abstract class Expirer extends NodeRepositoryMaintainer { private final Duration expiryTime; Expirer(Node.State fromState, History.Event.Type eventType, NodeRepository nodeRepository, - Clock clock, Duration expiryTime) { - super(nodeRepository, min(Duration.ofMinutes(10), expiryTime)); + Clock clock, Duration expiryTime, Metric metric) { + super(nodeRepository, min(Duration.ofMinutes(10), expiryTime), metric); this.fromState = fromState; this.eventType = eventType; this.clock = clock; @@ -41,7 +42,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { List<Node> expired = new ArrayList<>(); for (Node node : nodeRepository().getNodes(fromState)) { if (isExpired(node)) @@ -50,6 +51,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { if ( ! expired.isEmpty()) log.info(fromState + " expirer found " + expired.size() + " expired nodes: " + expired); expire(expired); + return true; } protected boolean isExpired(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index d65b4ce4248..3f8cc58540d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.Zone; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; @@ -56,8 +57,8 @@ public class FailedExpirer extends NodeRepositoryMaintainer { private final Duration defaultExpiry; // Grace period to allow recovery of data private final Duration containerExpiry; // Stateless nodes, no data to recover - FailedExpirer(NodeRepository nodeRepository, Zone zone, Clock clock, Duration interval) { - super(nodeRepository, interval); + FailedExpirer(NodeRepository nodeRepository, Zone zone, Clock clock, Duration interval, Metric metric) { + super(nodeRepository, interval, metric); this.nodeRepository = nodeRepository; this.zone = zone; this.clock = clock; @@ -74,7 +75,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { List<Node> remainingNodes = new ArrayList<>(nodeRepository.list() .state(Node.State.failed) .nodeType(NodeType.tenant, NodeType.host) @@ -86,6 +87,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { node.history().hasEventBefore(History.Event.Type.failed, clock.instant().minus(containerExpiry))); recycleIf(remainingNodes, node -> node.history().hasEventBefore(History.Event.Type.failed, clock.instant().minus(defaultExpiry))); + return true; } /** Recycle the nodes matching condition, and remove those nodes from the nodes list. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java index 3cb7cc218a7..389fc0ee907 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; @@ -36,8 +37,8 @@ public class InactiveExpirer extends Expirer { private final NodeRepository nodeRepository; - InactiveExpirer(NodeRepository nodeRepository, Clock clock, Duration inactiveTimeout) { - super(Node.State.inactive, History.Event.Type.deactivated, nodeRepository, clock, inactiveTimeout); + InactiveExpirer(NodeRepository nodeRepository, Clock clock, Duration inactiveTimeout, Metric metric) { + super(Node.State.inactive, History.Event.Type.deactivated, nodeRepository, clock, inactiveTimeout, metric); this.nodeRepository = nodeRepository; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java index b933e549357..e317333135c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java @@ -2,10 +2,11 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.InfraDeployer; -import java.util.logging.Level; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Duration; +import java.util.logging.Level; import java.util.logging.Logger; /** @@ -20,8 +21,8 @@ public class InfrastructureProvisioner extends NodeRepositoryMaintainer { private final InfraDeployer infraDeployer; - InfrastructureProvisioner(NodeRepository nodeRepository, InfraDeployer infraDeployer, Duration interval) { - super(nodeRepository, interval); + InfrastructureProvisioner(NodeRepository nodeRepository, InfraDeployer infraDeployer, Duration interval, Metric metric) { + super(nodeRepository, interval, metric); this.infraDeployer = infraDeployer; } @@ -38,7 +39,9 @@ public class InfrastructureProvisioner extends NodeRepositoryMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { infraDeployer.activateAllSupportedInfraApplications(false); + return true; } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java index 6edd57de1c1..90cf3ba8f54 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java @@ -1,6 +1,7 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.lb.LoadBalancer; @@ -39,17 +40,16 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { private final LoadBalancerService service; private final CuratorDatabaseClient db; - LoadBalancerExpirer(NodeRepository nodeRepository, Duration interval, LoadBalancerService service) { - super(nodeRepository, interval); + LoadBalancerExpirer(NodeRepository nodeRepository, Duration interval, LoadBalancerService service, Metric metric) { + super(nodeRepository, interval, metric); this.service = Objects.requireNonNull(service, "service must be non-null"); this.db = nodeRepository.database(); } @Override - protected void maintain() { + protected boolean maintain() { expireReserved(); - removeInactive(); - pruneReals(); + return removeInactive() & pruneReals(); } /** Move reserved load balancer that have expired to inactive */ @@ -63,7 +63,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } /** Deprovision inactive load balancers that have expired */ - private void removeInactive() { + private boolean removeInactive() { var failed = new ArrayList<LoadBalancerId>(); var lastException = new AtomicReference<Exception>(); var now = nodeRepository().clock().instant(); @@ -88,10 +88,11 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { interval()), lastException.get()); } + return lastException.get() == null; } /** Remove reals from inactive load balancers */ - private void pruneReals() { + private boolean pruneReals() { var failed = new ArrayList<LoadBalancerId>(); var lastException = new AtomicReference<Exception>(); withLoadBalancersIn(State.inactive, lb -> { @@ -109,13 +110,14 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { }); if (!failed.isEmpty()) { log.log(Level.WARNING, String.format("Failed to remove reals from %d load balancers: %s, retrying in %s", - failed.size(), - failed.stream() - .map(LoadBalancerId::serializedForm) - .collect(Collectors.joining(", ")), - interval()), + failed.size(), + failed.stream() + .map(LoadBalancerId::serializedForm) + .collect(Collectors.joining(", ")), + interval()), lastException.get()); } + return lastException.get() == null; } /** Apply operation to all load balancers that exist in given state, while holding lock */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index c631de5f17b..e0d7dc5f19e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -50,7 +50,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { Supplier<Integer> pendingRedeploymentsSupplier, Duration interval, Clock clock) { - super(nodeRepository, interval); + super(nodeRepository, interval, metric); this.metric = metric; this.orchestrator = orchestrator; this.serviceMonitor = serviceMonitor; @@ -59,7 +59,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } @Override - public void maintain() { + public boolean maintain() { NodeList nodes = nodeRepository().list(); ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot(); @@ -68,6 +68,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { updateMaintenanceMetrics(); updateDockerMetrics(nodes); updateTenantUsageMetrics(nodes); + return true; } private void updateMaintenanceMetrics() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 9c1892a1920..a2a189769bf 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -78,7 +78,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { Duration downTimeLimit, Clock clock, Orchestrator orchestrator, ThrottlePolicy throttlePolicy, Metric metric) { // check ping status every five minutes, but at least twice as often as the down time limit - super(nodeRepository, min(downTimeLimit.dividedBy(2), Duration.ofMinutes(5))); + super(nodeRepository, min(downTimeLimit.dividedBy(2), Duration.ofMinutes(5)), metric); this.deployer = deployer; this.hostLivenessTracker = hostLivenessTracker; this.serviceMonitor = serviceMonitor; @@ -91,7 +91,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { int throttledHostFailures = 0; int throttledNodeFailures = 0; @@ -131,9 +131,11 @@ public class NodeFailer extends NodeRepositoryMaintainer { failActive(node, reason); } - metric.set(throttlingActiveMetric, Math.min( 1, throttledHostFailures + throttledNodeFailures), null); + int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures); + metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); + return throttlingActive == 0; } private void updateNodeLivenessEventsForReadyNodes(Mutex lock) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java index eb2b46dd53e..222ee631968 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java @@ -2,8 +2,9 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.vespa.hosted.provision.autoscale.NodeMetrics; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.autoscale.NodeMetrics; import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricsDb; import com.yahoo.yolean.Exceptions; @@ -26,14 +27,15 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { public NodeMetricsDbMaintainer(NodeRepository nodeRepository, NodeMetrics nodeMetrics, NodeMetricsDb nodeMetricsDb, - Duration interval) { - super(nodeRepository, interval); + Duration interval, + Metric metric) { + super(nodeRepository, interval, metric); this.nodeMetrics = nodeMetrics; this.nodeMetricsDb = nodeMetricsDb; } @Override - protected void maintain() { + protected boolean maintain() { int warnings = 0; for (ApplicationId application : activeNodesByApplication().keySet()) { try { @@ -46,6 +48,7 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { } } nodeMetricsDb.gc(nodeRepository().clock()); + return warnings == 0; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java index c78ed72ff42..f64f27b1219 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Flavor; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.IntFlag; @@ -32,15 +33,15 @@ public class NodeRebooter extends NodeRepositoryMaintainer { private final Clock clock; private final Random random; - NodeRebooter(NodeRepository nodeRepository, Clock clock, FlagSource flagSource) { - super(nodeRepository, Duration.ofMinutes(25)); + NodeRebooter(NodeRepository nodeRepository, Clock clock, FlagSource flagSource, Metric metric) { + super(nodeRepository, Duration.ofMinutes(25), metric); this.rebootIntervalInDays = Flags.REBOOT_INTERVAL_IN_DAYS.bindTo(flagSource); this.clock = clock; this.random = new Random(clock.millis()); // seed with clock for test determinism } @Override - protected void maintain() { + protected boolean maintain() { // Reboot candidates: Nodes in long-term states, where we know we can safely orchestrate a reboot List<Node> nodesToReboot = nodeRepository().getNodes(Node.State.active, Node.State.ready).stream() .filter(node -> node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) @@ -49,6 +50,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { if (!nodesToReboot.isEmpty()) nodeRepository().reboot(NodeListFilter.from(nodesToReboot)); + return true; } private boolean shouldReboot(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index 8368569cda0..85477dad729 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -1,12 +1,15 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.concurrent.maintenance.JobMetrics; import com.yahoo.concurrent.maintenance.Maintainer; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.NodeType; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import java.time.Clock; import java.time.Duration; import java.util.List; import java.util.Map; @@ -21,8 +24,9 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { private final NodeRepository nodeRepository; - public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval) { - super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), nodeRepository.database().cluster()); + public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) { + super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), + jobMetrics(nodeRepository.clock(), metric), nodeRepository.database().cluster()); this.nodeRepository = nodeRepository; } @@ -41,4 +45,11 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { .collect(Collectors.groupingBy(node -> node.allocation().get().owner())); } + private static JobMetrics jobMetrics(Clock clock, Metric metric) { + return new JobMetrics(clock, (job, instant) -> { + Duration sinceSuccess = Duration.between(instant, clock.instant()); + metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job))); + }); + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 4323622df8b..a5482281ef1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -75,25 +75,25 @@ public class NodeRepositoryMaintenance extends AbstractComponent { nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, defaults.failGrace, clock, orchestrator, throttlePolicyFromEnv().orElse(defaults.throttlePolicy), metric); periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, metric, nodeRepository, defaults.redeployMaintainerInterval, defaults.periodicRedeployInterval); operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.operatorChangeRedeployInterval); - reservationExpirer = new ReservationExpirer(nodeRepository, clock, defaults.reservationExpiry); + reservationExpirer = new ReservationExpirer(nodeRepository, clock, defaults.reservationExpiry, metric); retiredExpirer = new RetiredExpirer(nodeRepository, orchestrator, deployer, metric, clock, defaults.retiredInterval, defaults.retiredExpiry); - inactiveExpirer = new InactiveExpirer(nodeRepository, clock, defaults.inactiveExpiry); - failedExpirer = new FailedExpirer(nodeRepository, zone, clock, defaults.failedExpirerInterval); - dirtyExpirer = new DirtyExpirer(nodeRepository, clock, defaults.dirtyExpiry); - provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, defaults.provisionedExpiry); - nodeRebooter = new NodeRebooter(nodeRepository, clock, flagSource); + inactiveExpirer = new InactiveExpirer(nodeRepository, clock, defaults.inactiveExpiry, metric); + failedExpirer = new FailedExpirer(nodeRepository, zone, clock, defaults.failedExpirerInterval, metric); + dirtyExpirer = new DirtyExpirer(nodeRepository, clock, defaults.dirtyExpiry, metric); + provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, defaults.provisionedExpiry, metric); + nodeRebooter = new NodeRebooter(nodeRepository, clock, flagSource, metric); metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval, clock); - infrastructureProvisioner = new InfrastructureProvisioner(nodeRepository, infraDeployer, defaults.infrastructureProvisionInterval); + infrastructureProvisioner = new InfrastructureProvisioner(nodeRepository, infraDeployer, defaults.infrastructureProvisionInterval, metric); loadBalancerExpirer = provisionServiceProvider.getLoadBalancerService(nodeRepository).map(lbService -> - new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService)); + new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService, metric)); dynamicProvisioningMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner -> - new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource)); + new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric)); spareCapacityMaintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, defaults.spareCapacityMaintenanceInterval); - osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval); + osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval, metric); rebalancer = new Rebalancer(deployer, nodeRepository, metric, clock, defaults.rebalancerInterval); - nodeMetricsDbMaintainer = new NodeMetricsDbMaintainer(nodeRepository, nodeMetrics, nodeMetricsDb, defaults.nodeMetricsCollectionInterval); + nodeMetricsDbMaintainer = new NodeMetricsDbMaintainer(nodeRepository, nodeMetrics, nodeMetricsDb, defaults.nodeMetricsCollectionInterval, metric); autoscalingMaintainer = new AutoscalingMaintainer(nodeRepository, nodeMetricsDb, deployer, metric, defaults.autoscalingInterval); - scalingSuggestionsMaintainer = new ScalingSuggestionsMaintainer(nodeRepository, nodeMetricsDb, defaults.scalingSuggestionsInterval); + scalingSuggestionsMaintainer = new ScalingSuggestionsMaintainer(nodeRepository, nodeMetricsDb, defaults.scalingSuggestionsInterval, metric); // The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now infrastructureProvisioner.maintainButThrowOnException(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java index 11afbd785e8..be1190ccff4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.NodeType; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -17,17 +18,18 @@ import java.time.Duration; */ public class OsUpgradeActivator extends NodeRepositoryMaintainer { - public OsUpgradeActivator(NodeRepository nodeRepository, Duration interval) { - super(nodeRepository, interval); + public OsUpgradeActivator(NodeRepository nodeRepository, Duration interval, Metric metric) { + super(nodeRepository, interval, metric); } @Override - protected void maintain() { + protected boolean maintain() { for (var nodeType : NodeType.values()) { if (!nodeType.isHost()) continue; var active = canUpgradeOsOf(nodeType); nodeRepository().osVersions().resumeUpgradeOf(nodeType, active); } + return true; } /** Returns whether to allow OS upgrade of nodes of given type */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java index e1407f2a41d..d38bff091b0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; @@ -19,8 +20,8 @@ public class ProvisionedExpirer extends Expirer { private final NodeRepository nodeRepository; - ProvisionedExpirer(NodeRepository nodeRepository, Clock clock, Duration dirtyTimeout) { - super(Node.State.provisioned, History.Event.Type.provisioned, nodeRepository, clock, dirtyTimeout); + ProvisionedExpirer(NodeRepository nodeRepository, Clock clock, Duration dirtyTimeout, Metric metric) { + super(Node.State.provisioned, History.Event.Type.provisioned, nodeRepository, clock, dirtyTimeout, metric); this.nodeRepository = nodeRepository; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 3df20fa9d08..9b9c7df5d0d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -31,22 +31,24 @@ public class Rebalancer extends NodeRepositoryMaintainer { Metric metric, Clock clock, Duration interval) { - super(nodeRepository, interval); + super(nodeRepository, interval, metric); this.deployer = deployer; this.metric = metric; this.clock = clock; } @Override - protected void maintain() { - if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; // Rebalancing not necessary - if (nodeRepository().zone().environment().isTest()) return; // Short lived deployments; no need to rebalance + protected boolean maintain() { + boolean success = true; + if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return success; // Rebalancing not necessary + if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed NodeList allNodes = nodeRepository().list(); updateSkewMetric(allNodes); - if ( ! zoneIsStable(allNodes)) return; + if ( ! zoneIsStable(allNodes)) return success; findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository()); + return success; } /** We do this here rather than in MetricsReporter because it is expensive and frequent updates are unnecessary */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java index 03d466dbf09..27f77dd08a3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; @@ -22,8 +23,8 @@ public class ReservationExpirer extends Expirer { private final NodeRepository nodeRepository; - public ReservationExpirer(NodeRepository nodeRepository, Clock clock, Duration reservationPeriod) { - super(Node.State.reserved, History.Event.Type.reserved, nodeRepository, clock, reservationPeriod); + public ReservationExpirer(NodeRepository nodeRepository, Clock clock, Duration reservationPeriod, Metric metric) { + super(Node.State.reserved, History.Event.Type.reserved, nodeRepository, clock, reservationPeriod, metric); this.nodeRepository = nodeRepository; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index a8566e24743..5b7f90102ba 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -39,7 +39,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { Clock clock, Duration maintenanceInterval, Duration retiredExpiry) { - super(nodeRepository, maintenanceInterval); + super(nodeRepository, maintenanceInterval, metric); this.deployer = deployer; this.metric = metric; this.orchestrator = orchestrator; @@ -48,7 +48,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { } @Override - protected void maintain() { + protected boolean maintain() { List<Node> activeNodes = nodeRepository().getNodes(Node.State.active); Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream() @@ -69,11 +69,12 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { nodeRepository().setRemovable(application, nodesToRemove); boolean success = deployment.activate(); - if ( ! success) return; + if ( ! success) return success; String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); } } + return true; } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java index b68e8eacbaa..b0c52d10f7d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java @@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -12,7 +13,6 @@ import com.yahoo.vespa.hosted.provision.applications.Applications; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler; import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricsDb; -import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; import java.time.Duration; import java.util.List; @@ -31,16 +31,19 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { public ScalingSuggestionsMaintainer(NodeRepository nodeRepository, NodeMetricsDb metricsDb, - Duration interval) { - super(nodeRepository, interval); + Duration interval, + Metric metric) { + super(nodeRepository, interval, metric); this.autoscaler = new Autoscaler(metricsDb, nodeRepository); } @Override - protected void maintain() { - if ( ! nodeRepository().zone().environment().isProduction()) return; + protected boolean maintain() { + boolean success = true; + if ( ! nodeRepository().zone().environment().isProduction()) return success; activeNodesByApplication().forEach((applicationId, nodes) -> suggest(applicationId, nodes)); + return success; } private void suggest(ApplicationId application, List<Node> applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 90c3a277080..20258e7947b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -56,15 +56,16 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { Metric metric, Duration interval, int maxIterations) { - super(nodeRepository, interval); + super(nodeRepository, interval, metric); this.deployer = deployer; this.metric = metric; this.maxIterations = maxIterations; } @Override - protected void maintain() { - if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; + protected boolean maintain() { + boolean success = true; + if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return success; CapacityChecker capacityChecker = new CapacityChecker(nodeRepository()); @@ -89,6 +90,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } metric.set("spareHostCapacity", spareHostCapacity, null); } + return success; } private boolean execute(List<Move> mitigation) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index ba859655ab7..cecedb0e909 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -233,7 +233,8 @@ public class DynamicProvisioningMaintainerTest { this.maintainer = new DynamicProvisioningMaintainer(nodeRepository, Duration.ofDays(1), hostProvisioner, - flagSource); + flagSource, + new TestMetric()); } private DynamicProvisioningTester addInitialNodes() { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index ed6f31984a5..f8e21ebbfce 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -263,7 +263,7 @@ public class FailedExpirerTest { false, 0); this.provisioner = new NodeRepositoryProvisioner(nodeRepository, zone, new MockProvisionServiceProvider(), new InMemoryFlagSource()); - this.expirer = new FailedExpirer(nodeRepository, zone, clock, Duration.ofMinutes(30)); + this.expirer = new FailedExpirer(nodeRepository, zone, clock, Duration.ofMinutes(30), new TestMetric()); } public ManualClock clock() { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveAndFailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveAndFailedExpirerTest.java index 89e43f80479..3d17cbf0217 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveAndFailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveAndFailedExpirerTest.java @@ -64,7 +64,7 @@ public class InactiveAndFailedExpirerTest { // Inactive times out tester.advanceTime(Duration.ofMinutes(14)); - new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10)).run(); + new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10), new TestMetric()).run(); assertEquals(0, tester.nodeRepository().getNodes(Node.State.inactive).size()); List<Node> dirty = tester.nodeRepository().getNodes(Node.State.dirty); assertEquals(2, dirty.size()); @@ -79,7 +79,7 @@ public class InactiveAndFailedExpirerTest { // Dirty times out for the other one tester.advanceTime(Duration.ofMinutes(14)); - new DirtyExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10)).run(); + new DirtyExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10), new TestMetric()).run(); assertEquals(0, tester.nodeRepository().getNodes(NodeType.tenant, Node.State.dirty).size()); List<Node> failed = tester.nodeRepository().getNodes(NodeType.tenant, Node.State.failed); assertEquals(1, failed.size()); @@ -107,7 +107,7 @@ public class InactiveAndFailedExpirerTest { // Inactive times out and node is moved to dirty tester.advanceTime(Duration.ofMinutes(14)); - new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10)).run(); + new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10), new TestMetric()).run(); List<Node> dirty = tester.nodeRepository().getNodes(Node.State.dirty); assertEquals(2, dirty.size()); @@ -158,7 +158,7 @@ public class InactiveAndFailedExpirerTest { // Inactive times out and one node is moved to parked tester.advanceTime(Duration.ofMinutes(11)); // Trigger InactiveExpirer - new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10)).run(); + new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10), new TestMetric()).run(); assertEquals(1, tester.nodeRepository().getNodes(Node.State.parked).size()); } @@ -180,7 +180,7 @@ public class InactiveAndFailedExpirerTest { assertEquals(1, inactiveNodes.size()); // See that nodes are moved to dirty immediately. - new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10)).run(); + new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10), new TestMetric()).run(); assertEquals(0, tester.nodeRepository().getNodes(Node.State.inactive).size()); List<Node> dirty = tester.nodeRepository().getNodes(Node.State.dirty); assertEquals(1, dirty.size()); @@ -207,7 +207,7 @@ public class InactiveAndFailedExpirerTest { .map(node -> node.withWantToRetire(true, true, Agent.system, tester.clock().instant())) .collect(Collectors.toList()), () -> {}); tester.advanceTime(Duration.ofMinutes(11)); - new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10)).run(); + new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10), new TestMetric()).run(); assertEquals(2, tester.nodeRepository().getNodes(Node.State.parked).size()); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirerTest.java index a5e96369591..6c22f798fe0 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirerTest.java @@ -38,7 +38,8 @@ public class LoadBalancerExpirerTest { public void expire_inactive() { LoadBalancerExpirer expirer = new LoadBalancerExpirer(tester.nodeRepository(), Duration.ofDays(1), - tester.loadBalancerService()); + tester.loadBalancerService(), + new TestMetric()); Supplier<Map<LoadBalancerId, LoadBalancer>> loadBalancers = () -> tester.nodeRepository().database().readLoadBalancers((ignored) -> true); // Deploy two applications with a total of three load balancers @@ -103,7 +104,8 @@ public class LoadBalancerExpirerTest { public void expire_reserved() { LoadBalancerExpirer expirer = new LoadBalancerExpirer(tester.nodeRepository(), Duration.ofDays(1), - tester.loadBalancerService()); + tester.loadBalancerService(), + new TestMetric()); Supplier<Map<LoadBalancerId, LoadBalancer>> loadBalancers = () -> tester.nodeRepository().database().readLoadBalancers((ignored) -> true); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java index bae6de5a095..3ff81070516 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java @@ -26,7 +26,7 @@ public class NodeRebooterTest { var flagSource = new InMemoryFlagSource().withIntFlag(Flags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays()); var tester = new MaintenanceTester(); tester.createReadyHostNodes(15); - NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, flagSource); + NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, flagSource, new TestMetric()); assertReadyHosts(15, tester, 0L); @@ -69,7 +69,7 @@ public class NodeRebooterTest { var flagSource = new InMemoryFlagSource().withIntFlag(Flags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays()); var tester = new MaintenanceTester(); tester.createReadyHostNodes(2); - NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, flagSource); + NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, flagSource, new TestMetric()); assertReadyHosts(2, tester, 0L); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivatorTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivatorTest.java index 65c7bf13b42..218812f9a3d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivatorTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivatorTest.java @@ -34,7 +34,7 @@ public class OsUpgradeActivatorTest { @Test public void activates_upgrade() { var osVersions = tester.nodeRepository().osVersions(); - var osUpgradeActivator = new OsUpgradeActivator(tester.nodeRepository(), Duration.ofDays(1)); + var osUpgradeActivator = new OsUpgradeActivator(tester.nodeRepository(), Duration.ofDays(1), new TestMetric()); var version0 = Version.fromString("7.0"); // Create infrastructure nodes diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirerTest.java index 6ca154f5f17..bd92c2a9aa2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirerTest.java @@ -75,7 +75,7 @@ public class ReservationExpirerTest { // Reservation times out clock.advance(Duration.ofMinutes(14)); // Reserved but not used time out - new ReservationExpirer(nodeRepository, clock, Duration.ofMinutes(10)).run(); + new ReservationExpirer(nodeRepository, clock, Duration.ofMinutes(10), new TestMetric()).run(); // Assert nothing is reserved assertEquals(0, nodeRepository.getNodes(NodeType.tenant, Node.State.reserved).size()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index b7f21eb3114..be5c7f423c7 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -25,7 +25,6 @@ import java.time.Duration; import java.util.List; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; /** * Tests the scaling suggestions maintainer integration. @@ -66,7 +65,8 @@ public class ScalingSuggestionsMaintainerTest { ScalingSuggestionsMaintainer maintainer = new ScalingSuggestionsMaintainer(tester.nodeRepository(), nodeMetricsDb, - Duration.ofMinutes(1)); + Duration.ofMinutes(1), + new TestMetric()); maintainer.maintain(); assertEquals("14 nodes with [vcpu: 6.9, memory: 5.1 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps, storage type: remote]", diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java index 3eb379b0914..845eeba972c 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java @@ -22,6 +22,7 @@ import com.yahoo.transaction.NestedTransaction; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.maintenance.ReservationExpirer; +import com.yahoo.vespa.hosted.provision.maintenance.TestMetric; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; import org.junit.Test; @@ -791,7 +792,7 @@ public class ProvisioningTest { // Over 10 minutes pass since first reservation. First set of reserved nodes are not expired tester.clock().advance(Duration.ofMinutes(8).plus(Duration.ofSeconds(1))); ReservationExpirer expirer = new ReservationExpirer(tester.nodeRepository(), tester.clock(), - Duration.ofMinutes(10)); + Duration.ofMinutes(10), new TestMetric()); expirer.run(); assertEquals("Nodes remain reserved", 4, tester.getNodes(application, Node.State.reserved).size()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java index b1ecd03aa13..81bf999a184 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java @@ -9,6 +9,7 @@ import com.yahoo.text.Utf8; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.maintenance.OsUpgradeActivator; +import com.yahoo.vespa.hosted.provision.maintenance.TestMetric; import com.yahoo.vespa.hosted.provision.testutils.MockNodeRepository; import com.yahoo.vespa.hosted.provision.testutils.OrchestratorMock; import org.junit.After; @@ -763,7 +764,7 @@ public class NodesV2ApiTest { // Activate target var nodeRepository = (NodeRepository)tester.container().components().getComponent(MockNodeRepository.class.getName()); - var osUpgradeActivator = new OsUpgradeActivator(nodeRepository, Duration.ofDays(1)); + var osUpgradeActivator = new OsUpgradeActivator(nodeRepository, Duration.ofDays(1), new TestMetric()); osUpgradeActivator.run(); // Other node type does not return wanted OS version diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java new file mode 100644 index 00000000000..4c05d46d782 --- /dev/null +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java @@ -0,0 +1,41 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.concurrent.maintenance; + +import java.time.Clock; +import java.time.Instant; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.BiConsumer; + +/** + * Tracks and forwards maintenance job metrics. + * + * @author mpolden + */ +public class JobMetrics { + + private final Clock clock; + private final BiConsumer<String, Instant> metricConsumer; + + private final Map<String, Instant> successfulRuns = new ConcurrentHashMap<>(); + + public JobMetrics(Clock clock, BiConsumer<String, Instant> metricConsumer) { + this.clock = Objects.requireNonNull(clock); + this.metricConsumer = metricConsumer; + } + + /** Record successful run of given job */ + public void recordSuccessOf(String job) { + successfulRuns.put(job, clock.instant()); + } + + /** Forward metrics for given job to metric consumer */ + public void forward(String job) { + Instant lastSuccess = successfulRuns.get(job); + if (lastSuccess != null) { + metricConsumer.accept(job, lastSuccess); + } + } + +} diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java index 9c40e5ec54f..0385c27536d 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java @@ -26,17 +26,19 @@ public abstract class Maintainer implements Runnable, AutoCloseable { private final String name; private final JobControl jobControl; + private final JobMetrics jobMetrics; private final Duration interval; private final ScheduledExecutorService service; - public Maintainer(String name, Duration interval, Instant startedAt, JobControl jobControl, List<String> clusterHostnames) { - this(name, interval, staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames), jobControl); + public Maintainer(String name, Duration interval, Instant startedAt, JobControl jobControl, JobMetrics jobMetrics, List<String> clusterHostnames) { + this(name, interval, staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames), jobControl, jobMetrics); } - public Maintainer(String name, Duration interval, Duration initialDelay, JobControl jobControl) { + public Maintainer(String name, Duration interval, Duration initialDelay, JobControl jobControl, JobMetrics jobMetrics) { this.name = name; this.interval = requireInterval(interval); this.jobControl = Objects.requireNonNull(jobControl); + this.jobMetrics = Objects.requireNonNull(jobMetrics); service = new ScheduledThreadPoolExecutor(1, r -> new Thread(r, name() + "-worker")); service.scheduleAtFixedRate(this, initialDelay.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); jobControl.started(name(), this); @@ -72,8 +74,8 @@ public abstract class Maintainer implements Runnable, AutoCloseable { @Override public final String toString() { return name(); } - /** Called once each time this maintenance job should run */ - protected abstract void maintain(); + /** Called once each time this maintenance job should run. Returns whether the maintenance run was succesful */ + protected abstract boolean maintain(); /** Returns the interval at which this job is set to run */ protected Duration interval() { return interval; } @@ -82,7 +84,12 @@ public abstract class Maintainer implements Runnable, AutoCloseable { @SuppressWarnings("unused") public final void lockAndMaintain() { try (var lock = jobControl.lockJob(name())) { - maintain(); + try { + if (maintain()) jobMetrics.recordSuccessOf(name()); + } finally { + // Always forward metrics + jobMetrics.forward(name()); + } } } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlStateMock.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlStateMock.java new file mode 100644 index 00000000000..28c701a67db --- /dev/null +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlStateMock.java @@ -0,0 +1,35 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.concurrent.maintenance; + +import com.yahoo.transaction.Mutex; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * @author mpolden + */ +class JobControlStateMock implements JobControlState { + + private final Set<String> inactiveJobs = new HashSet<>(); + + @Override + public Set<String> readInactiveJobs() { + return Collections.unmodifiableSet(inactiveJobs); + } + + @Override + public Mutex lockMaintenanceJob(String job) { + return () -> {}; + } + + public void setActive(String job, boolean active) { + if (active) { + inactiveJobs.remove(job); + } else { + inactiveJobs.add(job); + } + } + +} diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java index 0640ab2835a..a0ca9b529c5 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java @@ -1,15 +1,8 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; -import com.yahoo.transaction.Mutex; import org.junit.Test; -import java.time.Duration; -import java.time.Instant; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -21,18 +14,13 @@ public class JobControlTest { @Test public void testJobControl() { - MockJobControlState state = new MockJobControlState(); + JobControlStateMock state = new JobControlStateMock(); JobControl jobControl = new JobControl(state); - MockMaintainer maintainer1 = new MockMaintainer(); - MockMaintainer maintainer2 = new MockMaintainer(); - assertTrue(jobControl.jobs().isEmpty()); - String job1 = "Job1"; String job2 = "Job2"; - - jobControl.started(job1, maintainer1); - jobControl.started(job2, maintainer2); + TestMaintainer maintainer1 = new TestMaintainer(job1, jobControl); + TestMaintainer maintainer2 = new TestMaintainer(job2, jobControl); assertEquals(2, jobControl.jobs().size()); assertTrue(jobControl.jobs().contains(job1)); assertTrue(jobControl.jobs().contains(job2)); @@ -59,79 +47,36 @@ public class JobControlTest { // Run jobs on-demand jobControl.run(job1); jobControl.run(job1); - assertEquals(2, maintainer1.maintenanceInvocations); + assertEquals(2, maintainer1.totalRuns()); jobControl.run(job2); - assertEquals(1, maintainer2.maintenanceInvocations); + assertEquals(1, maintainer2.totalRuns()); // Running jobs on-demand ignores inactive flag state.setActive(job1, false); jobControl.run(job1); - assertEquals(3, maintainer1.maintenanceInvocations); + assertEquals(3, maintainer1.totalRuns()); } @Test public void testJobControlMayDeactivateJobs() { - MockJobControlState state = new MockJobControlState(); + JobControlStateMock state = new JobControlStateMock(); JobControl jobControl = new JobControl(state); - MockMaintainer mockMaintainer = new MockMaintainer(jobControl); + TestMaintainer mockMaintainer = new TestMaintainer(null, jobControl); - assertTrue(jobControl.jobs().contains("MockMaintainer")); + assertTrue(jobControl.jobs().contains("TestMaintainer")); - assertEquals(0, mockMaintainer.maintenanceInvocations); + assertEquals(0, mockMaintainer.totalRuns()); mockMaintainer.run(); - assertEquals(1, mockMaintainer.maintenanceInvocations); + assertEquals(1, mockMaintainer.totalRuns()); - state.setActive("MockMaintainer", false); + state.setActive("TestMaintainer", false); mockMaintainer.run(); - assertEquals(1, mockMaintainer.maintenanceInvocations); + assertEquals(1, mockMaintainer.totalRuns()); - state.setActive("MockMaintainer", true); + state.setActive("TestMaintainer", true); mockMaintainer.run(); - assertEquals(2, mockMaintainer.maintenanceInvocations); - } - - private static class MockJobControlState implements JobControlState { - - private final Set<String> inactiveJobs = new HashSet<>(); - - @Override - public Set<String> readInactiveJobs() { - return new HashSet<>(inactiveJobs); - } - - @Override - public Mutex lockMaintenanceJob(String job) { - return () -> {}; - } - - public void setActive(String job, boolean active) { - if (active) { - inactiveJobs.remove(job); - } else { - inactiveJobs.add(job); - } - } - - } - - private static class MockMaintainer extends Maintainer { - - int maintenanceInvocations = 0; - - private MockMaintainer(JobControl jobControl) { - super(null, Duration.ofHours(1), Instant.now(), jobControl, List.of()); - } - - private MockMaintainer() { - this(new JobControl(new MockJobControlState())); - } - - @Override - protected void maintain() { - maintenanceInvocations++; - } - + assertEquals(2, mockMaintainer.totalRuns()); } } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java index 820d1fc3d1d..47ed010e95e 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java @@ -1,13 +1,16 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; +import com.yahoo.test.ManualClock; import org.junit.Test; import java.time.Duration; import java.time.Instant; import java.util.List; +import java.util.concurrent.atomic.AtomicReference; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; /** * @author freva @@ -36,4 +39,39 @@ public class MaintainerTest { assertEquals(300, Maintainer.staggeredDelay(interval, now, "cfg0", cluster).toMillis()); } + @Test + public void success_metric() { + ManualClock clock = new ManualClock(); + AtomicReference<Instant> lastSuccess = new AtomicReference<>(); + JobMetrics jobMetrics = new JobMetrics(clock, (job, instant) -> lastSuccess.set(instant)); + TestMaintainer maintainer = new TestMaintainer(jobMetrics); + + // Maintainer not successful yet + maintainer.successOnNextRun(false).run(); + assertNull(lastSuccess.get()); + + // Maintainer runs successfully + clock.advance(Duration.ofHours(1)); + Instant lastSuccess0 = clock.instant(); + maintainer.successOnNextRun(true).run(); + assertEquals(lastSuccess0, lastSuccess.get()); + + // Maintainer runs successfully again + clock.advance(Duration.ofHours(2)); + Instant lastSuccess1 = clock.instant(); + maintainer.run(); + assertEquals(lastSuccess1, lastSuccess.get()); + + // Maintainer throws + clock.advance(Duration.ofHours(5)); + maintainer.throwOnNextRun(true).run(); + assertEquals("Time of successful run is unchanged", lastSuccess1, lastSuccess.get()); + + // Maintainer recovers + clock.advance(Duration.ofHours(3)); + Instant lastSuccess2 = clock.instant(); + maintainer.throwOnNextRun(false).run(); + assertEquals(lastSuccess2, lastSuccess.get()); + } + } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java new file mode 100644 index 00000000000..0ea24fb6c2b --- /dev/null +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java @@ -0,0 +1,49 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.concurrent.maintenance; + +import java.time.Clock; +import java.time.Duration; + +/** + * @author mpolden + */ +class TestMaintainer extends Maintainer { + + private int totalRuns = 0; + private boolean success = true; + private boolean throwing = false; + + public TestMaintainer(String name, JobControl jobControl, JobMetrics jobMetrics) { + super(name, Duration.ofDays(1), Duration.ofDays(1), jobControl, jobMetrics); + } + + public TestMaintainer(JobMetrics jobMetrics) { + this(null, new JobControl(new JobControlStateMock()), jobMetrics); + } + + public TestMaintainer(String name, JobControl jobControl) { + this(name, jobControl, new JobMetrics(Clock.systemUTC(), (job, instant) -> {})); + } + + public int totalRuns() { + return totalRuns; + } + + public TestMaintainer successOnNextRun(boolean success) { + this.success = success; + return this; + } + + public TestMaintainer throwOnNextRun(boolean throwing) { + this.throwing = throwing; + return this; + } + + @Override + protected boolean maintain() { + if (throwing) throw new RuntimeException("Maintenance run failed"); + totalRuns++; + return success; + } + +} |