From ba36521578a55088c6e38d50b616af85eb33cf19 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Fri, 4 Jun 2021 11:42:11 +0200 Subject: Return success factor --- .../maintenance/ApplicationPackageMaintainer.java | 10 +++--- .../maintenance/FileDistributionMaintainer.java | 4 +-- .../server/maintenance/ReindexingMaintainer.java | 11 ++++--- .../server/maintenance/SessionsMaintainer.java | 4 +-- .../server/maintenance/TenantsMaintainer.java | 6 ++-- .../ApplicationMetaDataGarbageCollector.java | 6 ++-- .../maintenance/ApplicationOwnershipConfirmer.java | 37 +++++++++++++--------- .../maintenance/ArchiveAccessMaintainer.java | 6 ++-- .../controller/maintenance/ArchiveUriUpdater.java | 4 +-- .../maintenance/ChangeRequestMaintainer.java | 4 +-- .../controller/maintenance/CloudEventReporter.java | 4 +-- .../maintenance/ContactInformationMaintainer.java | 10 +++--- .../maintenance/ContainerImageExpirer.java | 4 +-- .../maintenance/CostReportMaintainer.java | 4 +-- .../controller/maintenance/DeploymentExpirer.java | 10 +++--- .../maintenance/DeploymentIssueReporter.java | 35 ++++++++++---------- .../maintenance/DeploymentMetricsMaintainer.java | 4 +-- .../maintenance/EndpointCertificateMaintainer.java | 6 ++-- .../controller/maintenance/HostInfoUpdater.java | 4 +-- .../maintenance/InfrastructureUpgrader.java | 14 +++++--- .../hosted/controller/maintenance/JobRunner.java | 4 +-- .../controller/maintenance/MetricsReporter.java | 4 +-- .../maintenance/NameServiceDispatcher.java | 7 ++-- .../controller/maintenance/OsUpgradeScheduler.java | 4 +-- .../maintenance/OsVersionStatusUpdater.java | 6 ++-- .../maintenance/OutstandingChangeDeployer.java | 4 +-- .../controller/maintenance/ReadyJobsTrigger.java | 4 +-- .../maintenance/ReindexingTriggerer.java | 6 ++-- .../maintenance/ResourceMeterMaintainer.java | 6 ++-- .../maintenance/ResourceTagMaintainer.java | 4 +-- .../maintenance/SystemRoutingPolicyMaintainer.java | 4 +-- .../maintenance/TenantRoleMaintainer.java | 4 +-- .../maintenance/TrafficShareUpdater.java | 17 ++++++---- .../hosted/controller/maintenance/Upgrader.java | 4 +-- .../controller/maintenance/VCMRMaintainer.java | 4 +-- .../maintenance/VersionStatusUpdater.java | 6 ++-- .../maintenance/ControllerMaintainerTest.java | 4 +-- .../EndpointCertificateMaintainerTest.java | 10 +++--- .../maintenance/TrafficShareUpdaterTest.java | 10 +++--- .../maintenance/ApplicationMaintainer.java | 4 +-- .../maintenance/AutoscalingMaintainer.java | 9 +++--- .../maintenance/DynamicProvisioningMaintainer.java | 4 +-- .../hosted/provision/maintenance/Expirer.java | 4 +-- .../provision/maintenance/FailedExpirer.java | 4 +-- .../provision/maintenance/HostEncrypter.java | 4 +-- .../maintenance/InfrastructureProvisioner.java | 4 +-- .../provision/maintenance/LoadBalancerExpirer.java | 17 ++++++---- .../provision/maintenance/MetricsReporter.java | 4 +-- .../hosted/provision/maintenance/NodeFailer.java | 16 ++++++++-- .../provision/maintenance/NodeHealthTracker.java | 17 ++++++---- .../maintenance/NodeMetricsDbMaintainer.java | 21 ++++++------ .../hosted/provision/maintenance/NodeRebooter.java | 4 +-- .../provision/maintenance/OsUpgradeActivator.java | 4 +-- .../hosted/provision/maintenance/Rebalancer.java | 13 ++++---- .../provision/maintenance/RetiredExpirer.java | 11 +++++-- .../maintenance/ScalingSuggestionsMaintainer.java | 11 ++++--- .../maintenance/SpareCapacityMaintainer.java | 10 +++--- .../provision/maintenance/SwitchRebalancer.java | 10 +++--- .../maintenance/NodeMetricsDbMaintainerTest.java | 2 +- .../yahoo/concurrent/maintenance/Maintainer.java | 9 ++++-- .../concurrent/maintenance/TestMaintainer.java | 4 +-- 61 files changed, 272 insertions(+), 213 deletions(-) diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java index 5519ffc1bdc..003b4fbb345 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java @@ -48,8 +48,9 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer { } @Override - protected boolean maintain() { - boolean success = true; + protected double maintain() { + int attempts = 0; + int failures = 0; try (var fileDownloader = new FileDownloader(connectionPool, downloadDirectory)) { for (var applicationId : applicationRepository.listApplications()) { @@ -62,11 +63,12 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer { log.fine(() -> "Verifying application package file reference " + applicationPackage + " for session " + sessionId); if (applicationPackage != null) { + attempts++; if (! fileReferenceExistsOnDisk(downloadDirectory, applicationPackage)) { log.fine(() -> "Downloading missing application package for application " + applicationId + " - session " + sessionId); if (fileDownloader.getFile(applicationPackage).isEmpty()) { - success = false; + failures++; log.warning("Failed to download application package for application " + applicationId + " - session " + sessionId); continue; } @@ -75,7 +77,7 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer { } } } - return success; + return asSuccessFactor(attempts, failures); } @Override diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java index b0876fb57e8..ca8db30c21f 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java @@ -33,9 +33,9 @@ public class FileDistributionMaintainer extends ConfigServerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { applicationRepository.deleteUnusedFiledistributionReferences(fileReferencesDir, maxUnusedFileReferenceAge); - return true; + return 1.0; } } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java index 971c2c20ae9..af9ea917aaf 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java @@ -22,6 +22,7 @@ import java.util.Comparator; import java.util.Map; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Supplier; import java.util.logging.Level; @@ -51,8 +52,9 @@ public class ReindexingMaintainer extends ConfigServerMaintainer { } @Override - protected boolean maintain() { - AtomicBoolean success = new AtomicBoolean(true); + protected double maintain() { + AtomicInteger attempts = new AtomicInteger(0); + AtomicInteger failures = new AtomicInteger(0); for (Tenant tenant : applicationRepository.tenantRepository().getAllTenants()) { ApplicationCuratorDatabase database = tenant.getApplicationRepo().database(); for (ApplicationId id : database.activeApplications()) @@ -60,6 +62,7 @@ public class ReindexingMaintainer extends ConfigServerMaintainer { .map(application -> application.getForVersionOrLatest(Optional.empty(), clock.instant())) .ifPresent(application -> { try { + attempts.incrementAndGet(); applicationRepository.modifyReindexing(id, reindexing -> { reindexing = withNewReady(reindexing, lazyGeneration(application), clock.instant()); reindexing = withOnlyCurrentData(reindexing, application); @@ -68,11 +71,11 @@ public class ReindexingMaintainer extends ConfigServerMaintainer { } catch (RuntimeException e) { log.log(Level.INFO, "Failed to update reindexing status for " + id + ": " + Exceptions.toMessageString(e)); - success.set(false); + failures.incrementAndGet(); } }); } - return success.get(); + return asSuccessFactor(attempts.get(), failures.get()); } private Supplier lazyGeneration(Application application) { diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java index 7482980e221..1f85dd4579d 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java @@ -25,7 +25,7 @@ public class SessionsMaintainer extends ConfigServerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { if (iteration % 10 == 0) log.log(Level.INFO, () -> "Running " + SessionsMaintainer.class.getSimpleName() + ", iteration " + iteration); @@ -38,7 +38,7 @@ public class SessionsMaintainer extends ConfigServerMaintainer { } iteration++; - return true; + return 1.0; } } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java index 7c01045ee72..0a7df2c9d21 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java @@ -31,12 +31,12 @@ public class TenantsMaintainer extends ConfigServerMaintainer { } @Override - protected boolean maintain() { - if ( ! applicationRepository.configserverConfig().hostedVespa()) return true; + protected double maintain() { + if ( ! applicationRepository.configserverConfig().hostedVespa()) return 1.0; Set tenants = applicationRepository.deleteUnusedTenants(ttlForUnusedTenant, clock.instant()); if (tenants.size() > 0) log.log(Level.INFO, "Deleted tenants " + tenants); - return true; + return 1.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java index 7d94a4c728f..9ec8e4d1a2d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java @@ -19,14 +19,14 @@ public class ApplicationMetaDataGarbageCollector extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try { controller().applications().applicationStore().pruneMeta(controller().clock().instant().minus(Duration.ofDays(365))); - return true; + return 1.0; } catch (Exception e) { log.log(Level.WARNING, "Exception pruning old application meta data", e); - return false; + return 0.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java index 1f20e48edf5..69e0eb26f16 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java @@ -18,6 +18,7 @@ import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.util.HashMap; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; /** @@ -39,15 +40,17 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { } @Override - protected boolean maintain() { - return confirmApplicationOwnerships() & - ensureConfirmationResponses() & - updateConfirmedApplicationOwners(); + protected double maintain() { + return ( confirmApplicationOwnerships() + + ensureConfirmationResponses() + + updateConfirmedApplicationOwners() ) + / 3; } /** File an ownership issue with the owners of all applications we know about. */ - private boolean confirmApplicationOwnerships() { - AtomicBoolean success = new AtomicBoolean(true); + private double confirmApplicationOwnerships() { + AtomicInteger attempts = new AtomicInteger(0); + AtomicInteger failures = new AtomicInteger(0); applications() .withProjectId() .withProductionDeployment() @@ -56,6 +59,7 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { .filter(application -> application.createdAt().isBefore(controller().clock().instant().minus(Duration.ofDays(90)))) .forEach(application -> { try { + attempts.incrementAndGet(); // TODO jvenstad: Makes sense to require, and run this only in main? tenantOf(application.id()).contact().flatMap(contact -> { return ownershipIssues.confirmOwnership(application.ownershipIssueId(), @@ -65,17 +69,17 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { }).ifPresent(newIssueId -> store(newIssueId, application.id())); } catch (RuntimeException e) { // Catch errors due to wrong data in the controller, or issues client timeout. - success.set(false); + failures.incrementAndGet(); log.log(Level.INFO, "Exception caught when attempting to file an issue for '" + application.id() + "': " + Exceptions.toMessageString(e)); } }); - return success.get(); + return asSuccessFactor(attempts.get(), failures.get()); } private ApplicationSummary summaryOf(TenantAndApplicationId application) { var app = applications.requireApplication(application); var metrics = new HashMap(); - for (Instance instance : app.instances().values()) + for (Instance instance : app.instances().values()) { for (var kv : instance.deployments().entrySet()) { var zone = kv.getKey(); var deploymentMetrics = kv.getValue().metrics(); @@ -83,28 +87,31 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { deploymentMetrics.queriesPerSecond(), deploymentMetrics.writesPerSecond())); } + } return new ApplicationSummary(app.id().defaultInstance(), app.activity().lastQueried(), app.activity().lastWritten(), app.latestVersion().flatMap(version -> version.buildTime()), metrics); } /** Escalate ownership issues which have not been closed before a defined amount of time has passed. */ - private boolean ensureConfirmationResponses() { - AtomicBoolean success = new AtomicBoolean(true); + private double ensureConfirmationResponses() { + AtomicInteger attempts = new AtomicInteger(0); + AtomicInteger failures = new AtomicInteger(0); for (Application application : applications()) application.ownershipIssueId().ifPresent(issueId -> { try { + attempts.incrementAndGet(); Tenant tenant = tenantOf(application.id()); ownershipIssues.ensureResponse(issueId, tenant.contact()); } catch (RuntimeException e) { - success.set(false); + failures.incrementAndGet(); log.log(Level.INFO, "Exception caught when attempting to escalate issue with id '" + issueId + "': " + Exceptions.toMessageString(e)); } }); - return success.get(); + return asSuccessFactor(attempts.get(), failures.get()); } - private boolean updateConfirmedApplicationOwners() { + private double updateConfirmedApplicationOwners() { applications() .withProjectId() .withProductionDeployment() @@ -118,7 +125,7 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer { controller().applications().store(lockedApplication.withOwner(owner))); }); }); - return true; + return 1.0; } private ApplicationList applications() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java index 1a9889284e1..b096a853541 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java @@ -37,8 +37,7 @@ public class ArchiveAccessMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { - + protected double maintain() { // Count buckets - so we can alert if we get close to the account limit of 1000 zoneRegistry.zones().all().ids().forEach(zoneId -> metric.set(bucketCountMetricName, archiveBucketDb.buckets(zoneId).size(), @@ -59,6 +58,7 @@ public class ArchiveAccessMaintainer extends ControllerMaintainer { ) ); - return true; + return 1.0; } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java index d2141b097b3..ab8e5efa0bd 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java @@ -38,7 +38,7 @@ public class ArchiveUriUpdater extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { Map> tenantsByZone = new HashMap<>(); for (var application : applications.asList()) { for (var instance : application.instances().values()) { @@ -63,7 +63,7 @@ public class ArchiveUriUpdater extends ControllerMaintainer { .forEach(tenant -> nodeRepository.removeArchiveUri(zone, tenant)); }); - return true; + return 1.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java index 1f360c477b9..14e3e685a8a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java @@ -43,14 +43,14 @@ public class ChangeRequestMaintainer extends ControllerMaintainer { @Override - protected boolean maintain() { + protected double maintain() { var currentChangeRequests = pruneOldChangeRequests(); var changeRequests = changeRequestClient.getChangeRequests(currentChangeRequests); logger.fine(() -> "Found requests: " + changeRequests); storeChangeRequests(changeRequests); - return true; + return 1.0; } private void storeChangeRequests(List changeRequests) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java index d923db936cb..5acd0c63670 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java @@ -38,7 +38,7 @@ public class CloudEventReporter extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { for (var region : zonesByCloudNativeRegion.keySet()) { List events = eventFetcher.getEvents(region); for (var event : events) { @@ -48,7 +48,7 @@ public class CloudEventReporter extends ControllerMaintainer { deprovisionAffectedHosts(region, event); } } - return true; + return 1.0; } /** Deprovision any host affected by given event */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java index 7b846fa288c..5ee39f7c8f2 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java @@ -35,12 +35,14 @@ public class ContactInformationMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { TenantController tenants = controller().tenants(); - boolean success = true; + int attempts = 0; + int failures = 0; for (Tenant tenant : tenants.asList()) { log.log(FINE, () -> "Updating contact information for " + tenant); try { + attempts++; switch (tenant.type()) { case athenz: tenants.lockIfPresent(tenant.name(), LockedTenant.Athenz.class, lockedTenant -> { @@ -56,13 +58,13 @@ public class ContactInformationMaintainer extends ControllerMaintainer { throw new IllegalArgumentException("Unexpected tenant type '" + tenant.type() + "'."); } } catch (Exception e) { - success = false; + failures++; log.log(Level.WARNING, "Failed to update contact information for " + tenant + ": " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } } - return success; + return asSuccessFactor(attempts, failures); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java index ff5fc4d2051..f1574381a3d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java @@ -34,7 +34,7 @@ public class ContainerImageExpirer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { Instant now = controller().clock().instant(); VersionStatus versionStatus = controller().readVersionStatus(); List imagesToExpire = controller().serviceRegistry().containerRegistry().list().stream() @@ -44,7 +44,7 @@ public class ContainerImageExpirer extends ControllerMaintainer { log.log(Level.INFO, "Expiring " + imagesToExpire.size() + " container images: " + imagesToExpire); controller().serviceRegistry().containerRegistry().deleteAll(imagesToExpire); } - return true; + return 1.0; } /** Returns whether given image is expired */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java index 28b64b5bfe0..21cda09d92a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java @@ -31,10 +31,10 @@ public class CostReportMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { var csv = CostCalculator.resourceShareByPropertyToCsv(nodeRepository, controller(), clock, consumer.fixedAllocations()); consumer.consume(csv); - return true; + return 1.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java index e5316788802..9e3da506ca8 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java @@ -28,8 +28,9 @@ public class DeploymentExpirer extends ControllerMaintainer { } @Override - protected boolean maintain() { - boolean success = true; + protected double maintain() { + int attempts = 0; + int failures = 0; for (Application application : controller().applications().readable()) { for (Instance instance : application.instances().values()) for (Deployment deployment : instance.deployments().values()) { @@ -37,16 +38,17 @@ public class DeploymentExpirer extends ControllerMaintainer { try { log.log(Level.INFO, "Expiring deployment of " + instance.id() + " in " + deployment.zone()); + attempts++; controller().applications().deactivate(instance.id(), deployment.zone()); } catch (Exception e) { - success = false; + failures++; log.log(Level.WARNING, "Could not expire " + deployment + " of " + instance + ": " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } } } - return success; + return asSuccessFactor(attempts, failures); } /** Returns whether given deployment has expired according to its TTL */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java index a3070ef55a0..4e53e07f5af 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java @@ -21,6 +21,7 @@ import java.util.Collection; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import static com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence.broken; @@ -45,10 +46,11 @@ public class DeploymentIssueReporter extends ControllerMaintainer { } @Override - protected boolean maintain() { - return maintainDeploymentIssues(applications()) & - maintainPlatformIssue(applications()) & - escalateInactiveDeploymentIssues(applications()); + protected double maintain() { + return ( maintainDeploymentIssues(applications()) + + maintainPlatformIssue(applications()) + + escalateInactiveDeploymentIssues(applications())) + / 3; } /** Returns the applications to maintain issue status for. */ @@ -63,7 +65,7 @@ public class DeploymentIssueReporter extends ControllerMaintainer { * and store the issue id for the filed issues. Also, clear the issueIds of applications * where deployment has not failed for this amount of time. */ - private boolean maintainDeploymentIssues(List applications) { + private double maintainDeploymentIssues(List applications) { List failingApplications = controller().jobController().deploymentStatuses(ApplicationList.from(applications)) .failingApplicationChangeSince(controller().clock().instant().minus(maxFailureAge)) .mapToList(status -> status.application().id()); @@ -73,7 +75,7 @@ public class DeploymentIssueReporter extends ControllerMaintainer { fileDeploymentIssueFor(application); else store(application.id(), null); - return true; + return 1.0; } /** @@ -81,27 +83,26 @@ public class DeploymentIssueReporter extends ControllerMaintainer { * applications that have been failing the upgrade to the system version for * longer than the set grace period, or update this list if the issue already exists. */ - private boolean maintainPlatformIssue(List applications) { - boolean success = true; + private double maintainPlatformIssue(List applications) { if (controller().system() == SystemName.cd) - return success; + return 1.0; VersionStatus versionStatus = controller().readVersionStatus(); Version systemVersion = controller().systemVersion(versionStatus); if (versionStatus.version(systemVersion).confidence() != broken) - return success; + return 1.0; DeploymentStatusList statuses = controller().jobController().deploymentStatuses(ApplicationList.from(applications)); if (statuses.failingUpgradeToVersionSince(systemVersion, controller().clock().instant().minus(upgradeGracePeriod)).isEmpty()) - return success; + return 1.0; List failingApplications = statuses.failingUpgradeToVersionSince(systemVersion, controller().clock().instant()) .mapToList(status -> status.application().id().defaultInstance()); // TODO jonmv: Send only tenant and application, here and elsewhere in this. deploymentIssues.fileUnlessOpen(failingApplications, systemVersion); - return success; + return 1.0; } private Tenant ownerOf(TenantAndApplicationId applicationId) { @@ -126,21 +127,23 @@ public class DeploymentIssueReporter extends ControllerMaintainer { } /** Escalate issues for which there has been no activity for a certain amount of time. */ - private boolean escalateInactiveDeploymentIssues(Collection applications) { - AtomicBoolean success = new AtomicBoolean(true); + private double escalateInactiveDeploymentIssues(Collection applications) { + AtomicInteger attempts = new AtomicInteger(0); + AtomicInteger failures = new AtomicInteger(0); applications.forEach(application -> application.deploymentIssueId().ifPresent(issueId -> { try { + attempts.incrementAndGet(); Tenant tenant = ownerOf(application.id()); deploymentIssues.escalateIfInactive(issueId, maxInactivity, tenant.type() == Tenant.Type.athenz ? tenant.contact() : Optional.empty()); } catch (RuntimeException e) { - success.set(false); + failures.incrementAndGet(); log.log(Level.INFO, "Exception caught when attempting to escalate issue with id '" + issueId + "': " + Exceptions.toMessageString(e)); } })); - return success.get(); + return asSuccessFactor(attempts.get(), failures.get()); } private void store(TenantAndApplicationId id, IssueId issueId) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java index a8214ac8a09..20154c4f122 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java @@ -44,7 +44,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { AtomicInteger failures = new AtomicInteger(0); AtomicInteger attempts = new AtomicInteger(0); AtomicReference lastException = new AtomicReference<>(null); @@ -92,7 +92,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer { } catch (InterruptedException e) { throw new RuntimeException(e); } - return lastException.get() == null; + return asSuccessFactor(attempts.get(), failures.get()); } static DeploymentMetrics updateDeploymentMetrics(DeploymentMetrics current, List metrics) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java index 55a957f0247..85a69b0f338 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java @@ -54,7 +54,7 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try { // In order of importance deployRefreshedCertificates(); @@ -62,10 +62,10 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer { deleteUnusedCertificates(); } catch (Exception e) { log.log(LogLevel.ERROR, "Exception caught while maintaining endpoint certificates", e); - return false; + return 0.0; } - return true; + return 1.0; } private void updateRefreshedCertificates() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java index 83ccda422e6..10e6f9eb039 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java @@ -38,7 +38,7 @@ public class HostInfoUpdater extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { Map nodeEntities = controller().serviceRegistry().entityService().listNodes().stream() .collect(Collectors.toMap(NodeEntity::hostname, Function.identity())); @@ -62,7 +62,7 @@ public class HostInfoUpdater extends ControllerMaintainer { LOG.info("Updated information for " + hostsUpdated + " hosts(s)"); } } - return true; + return 1.0; } private static Optional buildModelName(NodeEntity nodeEntity) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java index 9859d12510a..5101de73a33 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java @@ -39,23 +39,28 @@ public abstract class InfrastructureUpgrader extends ControllerMaintain } @Override - protected boolean maintain() { - targetVersion().ifPresent(target -> upgradeAll(target, managedApplications)); - return true; + protected double maintain() { + if (targetVersion().isEmpty()) return 1.0; + return upgradeAll(targetVersion().get(), managedApplications); } /** Deploy a list of system applications until they converge on the given version */ - private void upgradeAll(VERSION target, List applications) { + private double upgradeAll(VERSION target, List applications) { + int attempts = 0; + int failures = 0; for (List zones : upgradePolicy.asList()) { boolean converged = true; for (ZoneApi zone : zones) { try { + attempts++; converged &= upgradeAll(target, applications, zone); } catch (UnreachableNodeRepositoryException e) { + failures++; converged = false; log.warning(String.format("%s: Failed to communicate with node repository in %s, continuing with next parallel zone: %s", this, zone, Exceptions.toMessageString(e))); } catch (Exception e) { + failures++; converged = false; log.warning(String.format("%s: Failed to upgrade zone: %s, continuing with next parallel zone: %s", this, zone, Exceptions.toMessageString(e))); @@ -65,6 +70,7 @@ public abstract class InfrastructureUpgrader extends ControllerMaintain break; } } + return asSuccessFactor(attempts, failures); } /** Returns whether all applications have converged to the target version in zone */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index b84cfe5af9b..25207b733f0 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -49,10 +49,10 @@ public class JobRunner extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { jobs.active().forEach(this::advance); jobs.collectGarbage(); - return true; + return 1.0; } @Override diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index b26b94f0b28..3f65c2e49cd 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -73,7 +73,7 @@ public class MetricsReporter extends ControllerMaintainer { } @Override - public boolean maintain() { + public double maintain() { reportDeploymentMetrics(); reportRemainingRotations(); reportQueuedNameServiceRequests(); @@ -82,7 +82,7 @@ public class MetricsReporter extends ControllerMaintainer { reportAuditLog(); reportBrokenSystemVersion(versionStatus); reportTenantMetrics(); - return true; + return 1.0; } private void reportBrokenSystemVersion(VersionStatus versionStatus) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java index e57affdc15d..fe20db00e05 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java @@ -37,13 +37,12 @@ public class NameServiceDispatcher extends ControllerMaintainer { } @Override - protected boolean maintain() { - boolean success = true; + protected double maintain() { try (var lock = db.lockNameServiceQueue()) { var queue = db.readNameServiceQueue(); var instant = clock.instant(); var remaining = queue.dispatchTo(nameService, requestCount); - if (queue == remaining) return success; // Queue unchanged + if (queue == remaining) return 1.0; // Queue unchanged var dispatched = queue.first(requestCount); if (!dispatched.requests().isEmpty()) { @@ -54,7 +53,7 @@ public class NameServiceDispatcher extends ControllerMaintainer { } db.writeNameServiceQueue(remaining); } - return success; + return 1.0; } private static int requestCount(SystemName system) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java index e1618f05a7d..666d1c3b23a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java @@ -42,13 +42,13 @@ public class OsUpgradeScheduler extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { for (var cloud : supportedClouds()) { Optional newTarget = newTargetIn(cloud); if (newTarget.isEmpty()) continue; controller().upgradeOsIn(cloud, newTarget.get(), upgradeBudget(), false); } - return true; + return 1.0; } /** Returns the new target version for given cloud, if any */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java index cbd9207fda4..271dd277e1c 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java @@ -18,16 +18,16 @@ public class OsVersionStatusUpdater extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try { OsVersionStatus newStatus = OsVersionStatus.compute(controller()); controller().updateOsVersionStatus(newStatus); - return true; + return 1.0; } catch (Exception e) { log.log(Level.WARNING, "Failed to compute OS version status: " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } - return false; + return 0.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java index a032f266de5..9d93ac719b7 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java @@ -19,13 +19,13 @@ public class OutstandingChangeDeployer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { for (Application application : ApplicationList.from(controller().applications().readable()) .withProductionDeployment() .withDeploymentSpec() .asList()) controller().applications().deploymentTrigger().triggerNewRevision(application.id()); - return true; + return 1.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java index a626f21359a..ffe958cb63a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java @@ -17,9 +17,9 @@ public class ReadyJobsTrigger extends ControllerMaintainer { } @Override - public boolean maintain() { + public double maintain() { controller().applications().deploymentTrigger().triggerReadyJobs(); - return true; + return 1.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java index 263a33cf266..0bd74c844ae 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java @@ -40,7 +40,7 @@ public class ReindexingTriggerer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try { Instant now = controller().clock().instant(); for (Application application : controller().applications().asList()) @@ -51,11 +51,11 @@ public class ReindexingTriggerer extends ControllerMaintainer { && reindexingIsReady(controller().applications().applicationReindexing(id, deployment.zone()), now)) controller().applications().reindex(id, deployment.zone(), List.of(), List.of(), true); }); - return true; + return 1.0; } catch (RuntimeException e) { log.log(Level.WARNING, "Failed to trigger reindexing: " + Exceptions.toMessageString(e)); - return false; + return 0.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java index aed2e637e4b..39ad233ce46 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java @@ -79,19 +79,19 @@ public class ResourceMeterMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { Collection resourceSnapshots; try { resourceSnapshots = getAllResourceSnapshots(); } catch (Exception e) { log.log(Level.WARNING, "Failed to collect resource snapshots. Retrying in " + interval() + ". Error: " + Exceptions.toMessageString(e)); - return false; + return 0.0; } if (systemName.isPublic()) reportResourceSnapshots(resourceSnapshots); updateDeploymentCost(resourceSnapshots); - return true; + return 1.0; } void updateDeploymentCost(Collection resourceSnapshots) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java index c7bf7e765ed..ab988bcf0ac 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java @@ -28,7 +28,7 @@ public class ResourceTagMaintainer extends ControllerMaintainer { } @Override - public boolean maintain() { + public double maintain() { controller().zoneRegistry().zones() .ofCloud(CloudName.from("aws")) .reachable() @@ -38,7 +38,7 @@ public class ResourceTagMaintainer extends ControllerMaintainer { if (taggedResources > 0) log.log(Level.INFO, "Tagged " + taggedResources + " resources in " + zone.getId()); }); - return true; + return 1.0; } private Map> getTenantOfParentHosts(ZoneId zoneId) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java index 3b0a1fca4af..e40d772a673 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java @@ -21,14 +21,14 @@ public class SystemRoutingPolicyMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { for (var zone : controller().zoneRegistry().zones().all().ids()) { for (var application : SystemApplication.values()) { if (!application.hasEndpoint()) continue; controller().routing().policies().refresh(application.id(), DeploymentSpec.empty, zone); } } - return true; + return 1.0; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java index 1265d687850..637ae10bcc6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java @@ -23,7 +23,7 @@ public class TenantRoleMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { var roleService = controller().serviceRegistry().roleService(); var tenants = controller().tenants().asList(); var tenantsWithRoles = tenants.stream() @@ -31,7 +31,7 @@ public class TenantRoleMaintainer extends ControllerMaintainer { .filter(this::hasProductionDeployment) .collect(Collectors.toList()); roleService.maintainRoles(tenantsWithRoles); - return true; + return 1.0; } private boolean hasProductionDeployment(TenantName tenant) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java index fbe9faa9754..0af0d01478b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java @@ -36,30 +36,34 @@ public class TrafficShareUpdater extends ControllerMaintainer { } @Override - protected boolean maintain() { - boolean success = false; + protected double maintain() { Exception lastException = null; + int attempts = 0; + int failures = 0; for (var application : applications.asList()) { for (var instance : application.instances().values()) { for (var deployment : instance.deployments().values()) { if ( ! deployment.zone().environment().isProduction()) continue; try { - success |= updateTrafficFraction(instance, deployment); + attempts++; + updateTrafficFraction(instance, deployment); } catch (Exception e) { // Some failures due to locked applications are expected and benign + failures++; lastException = e; } } } } - if ( ! success && lastException != null) // log on complete failure + double successFactor = asSuccessFactor(attempts, failures); + if ( successFactor == 0 ) log.log(Level.WARNING, "Could not update traffic share on any applications", lastException); - return success; + return successFactor; } - private boolean updateTrafficFraction(Instance instance, Deployment deployment) { + private void updateTrafficFraction(Instance instance, Deployment deployment) { double qpsInZone = deployment.metrics().queriesPerSecond(); double totalQps = instance.deployments().values().stream() .filter(i -> i.zone().environment().isProduction()) @@ -73,7 +77,6 @@ public class TrafficShareUpdater extends ControllerMaintainer { maxReadShare = currentReadShare; // distribution can be incorrect nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare); - return true; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 8d5019904fa..2326f7b66ee 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -51,7 +51,7 @@ public class Upgrader extends ControllerMaintainer { * Schedule application upgrades. Note that this implementation must be idempotent. */ @Override - public boolean maintain() { + public double maintain() { // Determine target versions for each upgrade policy VersionStatus versionStatus = controller().readVersionStatus(); Version canaryTarget = controller().systemVersion(versionStatus); @@ -91,7 +91,7 @@ public class Upgrader extends ControllerMaintainer { upgrade(instances.with(UpgradePolicy.canary), canaryTarget, targetMajorVersion, instances.size()); defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, targetMajorVersion, numberOfApplicationsToUpgrade())); conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, targetMajorVersion, numberOfApplicationsToUpgrade())); - return true; + return 1.0; } /** Returns the target versions for given confidence, one per major version in the system */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java index fedf3d90760..4cd24289676 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java @@ -57,7 +57,7 @@ public class VCMRMaintainer extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { var changeRequests = curator.readChangeRequests() .stream() .filter(shouldUpdate()) @@ -81,7 +81,7 @@ public class VCMRMaintainer extends ControllerMaintainer { }); } }); - return true; + return 1.0; } /** diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java index a3e9672b715..e4866c43f13 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java @@ -29,7 +29,7 @@ public class VersionStatusUpdater extends ControllerMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try { VersionStatus newStatus = VersionStatus.compute(controller()); controller().updateVersionStatus(newStatus); @@ -37,12 +37,12 @@ public class VersionStatusUpdater extends ControllerMaintainer { controller().serviceRegistry().systemMonitor().reportSystemVersion(version.versionNumber(), convert(version.confidence())); }); - return true; + return 1.0; } catch (Exception e) { log.log(Level.WARNING, "Failed to compute version status: " + Exceptions.toMessageString(e) + ". Retrying in " + interval()); } - return false; + return 0.0; } static SystemMonitor.Confidence convert(VespaVersion.Confidence confidence) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java index c932ff25594..7dc5cb34818 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java @@ -74,9 +74,9 @@ public class ControllerMaintainerTest { } @Override - protected boolean maintain() { + protected double maintain() { executions.incrementAndGet(); - return success; + return success ? 1.0 : 0.0; } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java index 66bda66bbf9..ce219b8beed 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java @@ -33,7 +33,7 @@ public class EndpointCertificateMaintainerTest { @Test public void old_and_unused_cert_is_deleted() { tester.curator().writeEndpointCertificateMetadata(ApplicationId.defaultId(), exampleMetadata); - assertTrue(maintainer.maintain()); + assertEquals(1.0, maintainer.maintain(), 0.0000001); assertTrue(tester.curator().readEndpointCertificateMetadata(ApplicationId.defaultId()).isEmpty()); } @@ -41,7 +41,7 @@ public class EndpointCertificateMaintainerTest { public void unused_but_recently_used_cert_is_not_deleted() { EndpointCertificateMetadata recentlyRequestedCert = exampleMetadata.withLastRequested(tester.clock().instant().minusSeconds(3600).getEpochSecond()); tester.curator().writeEndpointCertificateMetadata(ApplicationId.defaultId(), recentlyRequestedCert); - assertTrue(maintainer.maintain()); + assertEquals(1.0, maintainer.maintain(), 0.0000001); assertEquals(Optional.of(recentlyRequestedCert), tester.curator().readEndpointCertificateMetadata(ApplicationId.defaultId())); } @@ -53,7 +53,7 @@ public class EndpointCertificateMaintainerTest { secretStore.setSecret(exampleMetadata.keyName(), "foo", 1); secretStore.setSecret(exampleMetadata.certName(), "bar", 1); - assertTrue(maintainer.maintain()); + assertEquals(1.0, maintainer.maintain(), 0.0000001); var updatedCert = Optional.of(recentlyRequestedCert.withLastRefreshed(tester.clock().instant().getEpochSecond()).withVersion(1)); @@ -77,7 +77,7 @@ public class EndpointCertificateMaintainerTest { tester.curator().writeEndpointCertificateMetadata(appId, exampleMetadata); - assertTrue(maintainer.maintain()); + assertEquals(1.0, maintainer.maintain(), 0.0000001); assertTrue(tester.curator().readEndpointCertificateMetadata(appId).isPresent()); // cert should not be deleted, the app is deployed! } @@ -97,7 +97,7 @@ public class EndpointCertificateMaintainerTest { tester.curator().writeEndpointCertificateMetadata(appId, exampleMetadata); - assertTrue(maintainer.maintain()); + assertEquals(1.0, maintainer.maintain(), 0.0000001); assertTrue(tester.curator().readEndpointCertificateMetadata(appId).isPresent()); // cert should not be deleted, the app is deployed! tester.clock().advance(Duration.ofDays(3)); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java index 2afa3a0faea..7b4882de3ff 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java @@ -39,7 +39,7 @@ public class TrafficShareUpdaterTest { // Single zone setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); deploymentMetricsMaintainer.maintain(); - assertTrue(updater.maintain()); + assertEquals(1.0, updater.maintain(), 0.0000001); assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); // Two zones @@ -48,14 +48,14 @@ public class TrafficShareUpdaterTest { setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); setQpsMetric(0.0, application.application().id().defaultInstance(), prod2, tester); deploymentMetricsMaintainer.maintain(); - assertTrue(updater.maintain()); + assertEquals(1.0, updater.maintain(), 0.0000001); assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); assertTrafficFraction(0.0, 1.0, application.instanceId(), prod2, tester); // - both hot setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester); setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); deploymentMetricsMaintainer.maintain(); - assertTrue(updater.maintain()); + assertEquals(1.0, updater.maintain(), 0.0000001); assertTrafficFraction(0.53, 1.0, application.instanceId(), prod1, tester); assertTrafficFraction(0.47, 1.0, application.instanceId(), prod2, tester); @@ -66,7 +66,7 @@ public class TrafficShareUpdaterTest { setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); setQpsMetric(0.0, application.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); - assertTrue(updater.maintain()); + assertEquals(1.0, updater.maintain(), 0.0000001); assertTrafficFraction(0.53, 0.53, application.instanceId(), prod1, tester); assertTrafficFraction(0.47, 0.50, application.instanceId(), prod2, tester); assertTrafficFraction(0.00, 0.50, application.instanceId(), prod3, tester); @@ -75,7 +75,7 @@ public class TrafficShareUpdaterTest { setQpsMetric(25.0, application.application().id().defaultInstance(), prod2, tester); setQpsMetric(25.0, application.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); - assertTrue(updater.maintain()); + assertEquals(1.0, updater.maintain(), 0.0000001); assertTrafficFraction(0.50, 0.5, application.instanceId(), prod1, tester); assertTrafficFraction(0.25, 0.5, application.instanceId(), prod2, tester); assertTrafficFraction(0.25, 0.5, application.instanceId(), prod3, tester); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 9ac1ca2b4c1..24160c19dfa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -41,9 +41,9 @@ public abstract class ApplicationMaintainer extends NodeRepositoryMaintainer { } @Override - protected final boolean maintain() { + protected final double maintain() { applicationsNeedingMaintenance().forEach(this::deploy); - return true; + return 1.0; } /** Returns the number of deployments that are pending execution */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 7da6e0d3ebe..59c34a26638 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -47,14 +47,13 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; - boolean success = true; - if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return success; + if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return 1.0; activeNodesByApplication().forEach(this::autoscale); - return success; + return 1.0; } private void autoscale(ApplicationId application, NodeList applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 4224667a726..d35455d70ee 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -79,13 +79,13 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { NodeList nodes = nodeRepository().nodes().list(); resumeProvisioning(nodes, lock); convergeToCapacity(nodes); } - return true; + return 1.0; } /** Resume provisioning of already provisioned hosts and their children */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java index 2443a12d198..25108425e6e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java @@ -40,7 +40,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { NodeList expired = nodeRepository().nodes().list(fromState).matching(this::isExpired); if ( ! expired.isEmpty()) { @@ -49,7 +49,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } metric.add("expired." + fromState, expired.size(), null); - return true; + return 1.0; } protected boolean isExpired(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index e98da35aa6a..7505ce42668 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -66,7 +66,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { List remainingNodes = new ArrayList<>(nodeRepository.nodes().list(Node.State.failed) .nodeType(NodeType.tenant, NodeType.host) .asList()); @@ -78,7 +78,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { recycleIf(remainingNodes, node -> node.allocation().get().membership().cluster().isStateful() && node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statefulExpiry))); - return true; + return 1.0; } /** Recycle the nodes matching condition, and remove those nodes from the nodes list. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java index caf20463f60..80f74a011c0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java @@ -43,7 +43,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { Instant now = nodeRepository().clock().instant(); NodeList allNodes = nodeRepository().nodes().list(); for (var nodeType : NodeType.values()) { @@ -51,7 +51,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer { if (upgradingVespa(allNodes, nodeType)) continue; unencryptedHosts(allNodes, nodeType).forEach(host -> encrypt(host, now)); } - return true; + return 1.0; } /** Returns whether any node of given type is currently upgrading its Vespa version */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java index e317333135c..d9f5ea6a7a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java @@ -39,9 +39,9 @@ public class InfrastructureProvisioner extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { infraDeployer.activateAllSupportedInfraApplications(false); - return true; + return 1.0; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java index 10069fd1a18..ac9d8d6671a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.google.common.collect.Sets; import com.yahoo.jdisc.Metric; +import com.yahoo.lang.MutableInteger; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.lb.LoadBalancer; @@ -54,9 +55,9 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { expireReserved(); - return removeInactive() & pruneReals(); + return ( removeInactive() + pruneReals() ) / 2; } /** Move reserved load balancer that have expired to inactive */ @@ -68,7 +69,8 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } /** Deprovision inactive load balancers that have expired */ - private boolean removeInactive() { + private double removeInactive() { + MutableInteger attempts = new MutableInteger(0); var failed = new ArrayList(); var lastException = new AtomicReference(); var expiry = nodeRepository().clock().instant().minus(inactiveExpiry); @@ -76,6 +78,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { lb.changedAt().isBefore(expiry) && allocatedNodes(lb.id()).isEmpty(), lb -> { try { + attempts.add(1); log.log(Level.INFO, () -> "Removing expired inactive load balancer " + lb.id()); service.remove(lb.id().application(), lb.id().cluster()); db.removeLoadBalancer(lb.id()); @@ -92,11 +95,12 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { .collect(Collectors.joining(", ")), interval())); } - return lastException.get() == null; + return asSuccessFactor(attempts.get(), failed.size()); } /** Remove reals from inactive load balancers */ - private boolean pruneReals() { + private double pruneReals() { + var attempts = new MutableInteger(0); var failed = new ArrayList(); var lastException = new AtomicReference(); patchLoadBalancers(lb -> lb.state() == State.inactive, lb -> { @@ -107,6 +111,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { reals.removeIf(real -> !allocatedNodes.contains(real.hostname().value())); if (reals.equals(lb.instance().get().reals())) return; // Nothing to remove try { + attempts.add(1); LOG.log(Level.INFO, () -> "Removing reals from inactive load balancer " + lb.id() + ": " + Sets.difference(lb.instance().get().reals(), reals)); service.create(new LoadBalancerSpec(lb.id().application(), lb.id().cluster(), reals), true); db.writeLoadBalancer(lb.with(lb.instance().map(instance -> instance.withReals(reals)))); @@ -124,7 +129,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { interval()), lastException.get()); } - return lastException.get() == null; + return asSuccessFactor(attempts.get(), failed.size()); } /** Patch load balancers matching given filter, while holding lock */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 85437b3e78a..3990c5099eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -65,7 +65,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } @Override - public boolean maintain() { + public double maintain() { NodeList nodes = nodeRepository().nodes().list(); ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot(); @@ -80,7 +80,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { updateRepairTicketMetrics(nodes); updateAllocationMetrics(nodes); updateExclusiveSwitchMetrics(nodes); - return true; + return 1.0; } private void updateAllocationMetrics(NodeList nodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index effa41dc69f..f16459ee8b9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -72,17 +72,21 @@ public class NodeFailer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; + int attempts = 0; + int failures = 0; int throttledHostFailures = 0; int throttledNodeFailures = 0; // Ready nodes try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { for (Map.Entry entry : getReadyNodesByFailureReason().entrySet()) { + attempts++; Node node = entry.getKey(); if (throttle(node)) { + failures++; if (node.type().isHost()) throttledHostFailures++; else @@ -96,10 +100,12 @@ public class NodeFailer extends NodeRepositoryMaintainer { // Active nodes for (Map.Entry entry : getActiveNodesByFailureReason().entrySet()) { + attempts++; Node node = entry.getKey(); if (!failAllowedFor(node.type())) continue; if (throttle(node)) { + failures++; if (node.type().isHost()) throttledHostFailures++; else @@ -116,11 +122,15 @@ public class NodeFailer extends NodeRepositoryMaintainer { if ( ! activeNodes.childrenOf(host).isEmpty()) continue; Optional locked = Optional.empty(); try { + attempts++; locked = nodeRepository().nodes().lockAndGet(host); if (locked.isEmpty()) continue; nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, "Host should be failed and have no tenant nodes"); } + catch (Exception e) { + failures++; + } finally { locked.ifPresent(NodeMutex::close); } @@ -130,7 +140,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); - return throttlingActive == 0; + return asSuccessFactor(attempts, failures); } private Map getReadyNodesByFailureReason() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index fe2fb5229f9..37969a30b81 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -5,6 +5,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationLockException; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.jdisc.Metric; +import com.yahoo.lang.MutableInteger; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; @@ -48,13 +49,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - updateReadyNodeLivenessEvents(); - updateActiveNodeDownState(); - return true; + protected double maintain() { + return ( updateReadyNodeLivenessEvents() + updateActiveNodeDownState() ) / 2; } - private void updateReadyNodeLivenessEvents() { + private double updateReadyNodeLivenessEvents() { // Update node last request events through ZooKeeper to collect request to all config servers. // We do this here ("lazily") to avoid writing to zk for each config request. try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { @@ -69,13 +68,16 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } } } + return 1.0; } /** * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it. * Otherwise we remove any "down" history record. */ - private void updateActiveNodeDownState() { + private double updateActiveNodeDownState() { + var attempts = new MutableInteger(0); + var failures = new MutableInteger(0); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional node = activeNodes.node(hostname.toString()); @@ -90,6 +92,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { try (var lock = nodeRepository().nodes().lock(owner)) { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation + attempts.add(1); if (isDown) { recordAsDown(node.get(), lock); } else { @@ -98,8 +101,10 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } catch (ApplicationLockException e) { // Fine, carry on with other nodes. We'll try updating this one in the next run log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e)); + failures.add(1); } }); + return asSuccessFactor(attempts.get(), failures.get()); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java index 1ea4577f7fe..d671900d08c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java @@ -33,19 +33,21 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { + int attempts = 0; + var failures = new MutableInteger(0); try { - var warnings = new MutableInteger(0); Set applications = activeNodesByApplication().keySet(); - if (applications.isEmpty()) return true; + if (applications.isEmpty()) return 1.0; long pauseMs = interval().toMillis() / applications.size() - 1; // spread requests over interval int done = 0; for (ApplicationId application : applications) { + attempts++; metricsFetcher.fetchMetrics(application) .whenComplete((metricsResponse, exception) -> handleResponse(metricsResponse, exception, - warnings, + failures, application)); if (++done < applications.size()) Thread.sleep(pauseMs); @@ -56,23 +58,22 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { nodeRepository().metricsDb().gc(); - // Suppress failures for manual zones for now to avoid noise - return nodeRepository().zone().environment().isManuallyDeployed() || warnings.get() == 0; + return asSuccessFactor(attempts, failures.get()); } catch (InterruptedException e) { - return false; + return asSuccessFactor(attempts, failures.get()); } } private void handleResponse(MetricsResponse response, Throwable exception, - MutableInteger warnings, + MutableInteger failures, ApplicationId application) { if (exception != null) { - if (warnings.get() < maxWarningsPerInvocation) + if (failures.get() < maxWarningsPerInvocation) log.log(Level.WARNING, "Could not update metrics for " + application + ": " + Exceptions.toMessageString(exception)); - warnings.add(1); + failures.add(1); } else if (response != null) { nodeRepository().metricsDb().addNodeMetrics(response.nodeMetrics()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java index 6ee657beadd..c282fcdb7fc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java @@ -37,7 +37,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { // Reboot candidates: Nodes in long-term states, where we know we can safely orchestrate a reboot List nodesToReboot = nodeRepository().nodes().list(Node.State.active, Node.State.ready).stream() .filter(node -> node.type().isHost()) @@ -46,7 +46,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { if (!nodesToReboot.isEmpty()) nodeRepository().nodes().reboot(NodeListFilter.from(nodesToReboot)); - return true; + return 1.0; } private boolean shouldReboot(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java index 4eba15307cb..749603a373d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java @@ -23,13 +23,13 @@ public class OsUpgradeActivator extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { for (var nodeType : NodeType.values()) { if (!nodeType.isHost()) continue; boolean resume = canUpgradeOsOf(nodeType); nodeRepository().osVersions().resumeUpgradeOf(nodeType, resume); } - return true; + return 1.0; } /** Returns whether to allow OS upgrade of nodes of given type */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 1543506a78e..7bb748c92c9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -33,19 +33,18 @@ public class Rebalancer extends NodeMover { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; - boolean success = true; - if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary - if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance + if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; // Rebalancing not necessary + if (nodeRepository().zone().environment().isTest()) return 1.0; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed NodeList allNodes = nodeRepository().nodes().list(); updateSkewMetric(allNodes); - if ( ! zoneIsStable(allNodes)) return success; + if ( ! zoneIsStable(allNodes)) return 1.0; findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository()); - return success; + return 1.0; } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index f72daf1bc2b..3f5893b368a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -48,7 +48,10 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { + int attempts = 0; + int successes = 0; + NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); Map retiredNodesByApplication = activeNodes.retired().groupingBy(node -> node.allocation().get().owner()); for (Map.Entry entry : retiredNodesByApplication.entrySet()) { @@ -57,17 +60,19 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { List nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList()); if (nodesToRemove.isEmpty()) continue; + attempts++; try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { if ( ! deployment.isValid()) continue; nodeRepository().nodes().setRemovable(application, nodesToRemove); boolean success = deployment.activate().isPresent(); - if ( ! success) return success; + if ( ! success) continue; String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); + successes++; } } - return true; + return attempts == 0 ? 1.0 : ((double)successes / attempts); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java index c217580872b..888f06a5004 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java @@ -36,13 +36,16 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().zone().environment().isProduction()) return true; + protected double maintain() { + if ( ! nodeRepository().zone().environment().isProduction()) return 1.0; + int attempts = 0; int successes = 0; - for (var application : activeNodesByApplication().entrySet()) + for (var application : activeNodesByApplication().entrySet()) { + attempts++; successes += suggest(application.getKey(), application.getValue()); - return successes > 0; + } + return attempts == 0 ? 1.0 : ((double)successes / attempts); } private int suggest(ApplicationId application, NodeList applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 0307ae13b24..0589571e9d8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -66,12 +66,11 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; - boolean success = true; // Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand. - if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; + if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; NodeList allNodes = nodeRepository().nodes().list(); CapacityChecker capacityChecker = new CapacityChecker(allNodes); @@ -80,6 +79,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { metric.set("overcommittedHosts", overcommittedHosts.size(), null); retireOvercommitedHosts(allNodes, overcommittedHosts); + boolean success = true; Optional failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); if (failurePath.isPresent()) { int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; @@ -96,7 +96,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } metric.set("spareHostCapacity", spareHostCapacity, null); } - return success; + return success ? 1.0 : 0.0; } private boolean execute(List mitigation, CapacityChecker.HostFailurePath failurePath) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index cfab980570d..44890f2f5af 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -33,14 +33,14 @@ public class SwitchRebalancer extends NodeMover { } @Override - protected boolean maintain() { - if (!nodeRepository().nodes().isWorking()) return false; - if (!nodeRepository().zone().environment().isProduction()) return true; + protected double maintain() { + if (!nodeRepository().nodes().isWorking()) return 0.0; + if (!nodeRepository().zone().environment().isProduction()) return 1.0; NodeList allNodes = nodeRepository().nodes().list(); // Lockless as strong consistency is not needed - if (!zoneIsStable(allNodes)) return true; + if (!zoneIsStable(allNodes)) return 1.0; findBestMove(allNodes).execute(false, Agent.SwitchRebalancer, deployer, metric, nodeRepository()); - return true; + return 1.0; } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java index 5b2f7ce91e8..cfe6e4d348d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java @@ -43,7 +43,7 @@ public class NodeMetricsDbMaintainerTest { fetcher, Duration.ofHours(1), new TestMetric()); - assertTrue(maintainer.maintain()); + assertEquals(maintainer.maintain(), 1.0, 0.0000001); List timeseriesList = tester.nodeRepository().metricsDb().getNodeTimeseries(Duration.ofDays(1), Set.of("host-1.yahoo.com", "host-2.yahoo.com")); assertEquals(2, timeseriesList.size()); diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java index fb2339a4334..2a9e6dda6b6 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java @@ -90,7 +90,12 @@ public abstract class Maintainer implements Runnable { * Note that this indicates whether something is wrong, so e.g if the call did nothing because it should do * nothing, 1.0 should be returned. */ - protected abstract boolean maintain(); + protected abstract double maintain(); + + /** Convenience methods to convert attempts and failures into a success factor */ + protected final double asSuccessFactor(int attempts, int failures) { + return attempts == 0 ? 1.0 : 1 - (double)failures / attempts; + } /** Returns the interval at which this job is set to run */ protected Duration interval() { return interval; } @@ -102,7 +107,7 @@ public abstract class Maintainer implements Runnable { jobMetrics.starting(name()); double successFactor = 0; try (var lock = jobControl.lockJob(name())) { - successFactor = maintain() ? 1.0 : 0.0; + successFactor = maintain(); if (successFactor > 0.0) jobMetrics.recordCompletionOf(name()); } catch (UncheckedTimeoutException e) { diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java index 44a00a37a83..7424b17cab2 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java @@ -33,10 +33,10 @@ class TestMaintainer extends Maintainer { } @Override - protected boolean maintain() { + protected double maintain() { if (exceptionToThrow != null) throw exceptionToThrow; totalRuns++; - return success; + return success ? 1.0 : 0.0; } } -- cgit v1.2.3