summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java10
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java18
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java4
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java11
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java4
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java37
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java10
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java20
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java10
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java35
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java14
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java7
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java17
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java6
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java13
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java10
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java10
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java9
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java17
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java16
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java17
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java21
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java18
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java13
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java11
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java11
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java10
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java10
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java2
-rw-r--r--vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java24
-rw-r--r--vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java24
-rw-r--r--vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java16
-rw-r--r--vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java28
-rw-r--r--vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java4
67 files changed, 378 insertions, 255 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java
index 5519ffc1bdc..003b4fbb345 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ApplicationPackageMaintainer.java
@@ -48,8 +48,9 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer {
}
@Override
- protected boolean maintain() {
- boolean success = true;
+ protected double maintain() {
+ int attempts = 0;
+ int failures = 0;
try (var fileDownloader = new FileDownloader(connectionPool, downloadDirectory)) {
for (var applicationId : applicationRepository.listApplications()) {
@@ -62,11 +63,12 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer {
log.fine(() -> "Verifying application package file reference " + applicationPackage + " for session " + sessionId);
if (applicationPackage != null) {
+ attempts++;
if (! fileReferenceExistsOnDisk(downloadDirectory, applicationPackage)) {
log.fine(() -> "Downloading missing application package for application " + applicationId + " - session " + sessionId);
if (fileDownloader.getFile(applicationPackage).isEmpty()) {
- success = false;
+ failures++;
log.warning("Failed to download application package for application " + applicationId + " - session " + sessionId);
continue;
}
@@ -75,7 +77,7 @@ public class ApplicationPackageMaintainer extends ConfigServerMaintainer {
}
}
}
- return success;
+ return asSuccessFactor(attempts, failures);
}
@Override
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java
index 4938f34131e..e0f0a4b4099 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java
@@ -35,14 +35,24 @@ public abstract class ConfigServerMaintainer extends Maintainer {
ConfigServerMaintainer(ApplicationRepository applicationRepository, Curator curator, FlagSource flagSource,
Instant now, Duration interval) {
super(null, interval, now, new JobControl(new JobControlFlags(curator, flagSource)),
- jobMetrics(applicationRepository.metric()), cluster(curator), false);
+ new ConfigServerJobMetrics(applicationRepository.metric()), cluster(curator), false);
this.applicationRepository = applicationRepository;
}
- private static JobMetrics jobMetrics(Metric metric) {
- return new JobMetrics((job, consecutiveFailures) -> {
+ private static class ConfigServerJobMetrics extends JobMetrics {
+
+ private final Metric metric;
+
+ public ConfigServerJobMetrics(Metric metric) {
+ this.metric = metric;
+ }
+
+ @Override
+ protected void recordCompletion(String job, Long consecutiveFailures, double successFactor) {
metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
- });
+ metric.set("maintenance.successFactor", successFactor, metric.createContext(Map.of("job", job)));
+ }
+
}
private static class JobControlFlags implements JobControlState {
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java
index b0876fb57e8..ca8db30c21f 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/FileDistributionMaintainer.java
@@ -33,9 +33,9 @@ public class FileDistributionMaintainer extends ConfigServerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
applicationRepository.deleteUnusedFiledistributionReferences(fileReferencesDir, maxUnusedFileReferenceAge);
- return true;
+ return 1.0;
}
}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java
index 971c2c20ae9..af9ea917aaf 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java
@@ -22,6 +22,7 @@ import java.util.Comparator;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;
import java.util.logging.Level;
@@ -51,8 +52,9 @@ public class ReindexingMaintainer extends ConfigServerMaintainer {
}
@Override
- protected boolean maintain() {
- AtomicBoolean success = new AtomicBoolean(true);
+ protected double maintain() {
+ AtomicInteger attempts = new AtomicInteger(0);
+ AtomicInteger failures = new AtomicInteger(0);
for (Tenant tenant : applicationRepository.tenantRepository().getAllTenants()) {
ApplicationCuratorDatabase database = tenant.getApplicationRepo().database();
for (ApplicationId id : database.activeApplications())
@@ -60,6 +62,7 @@ public class ReindexingMaintainer extends ConfigServerMaintainer {
.map(application -> application.getForVersionOrLatest(Optional.empty(), clock.instant()))
.ifPresent(application -> {
try {
+ attempts.incrementAndGet();
applicationRepository.modifyReindexing(id, reindexing -> {
reindexing = withNewReady(reindexing, lazyGeneration(application), clock.instant());
reindexing = withOnlyCurrentData(reindexing, application);
@@ -68,11 +71,11 @@ public class ReindexingMaintainer extends ConfigServerMaintainer {
}
catch (RuntimeException e) {
log.log(Level.INFO, "Failed to update reindexing status for " + id + ": " + Exceptions.toMessageString(e));
- success.set(false);
+ failures.incrementAndGet();
}
});
}
- return success.get();
+ return asSuccessFactor(attempts.get(), failures.get());
}
private Supplier<Long> lazyGeneration(Application application) {
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java
index 7482980e221..1f85dd4579d 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/SessionsMaintainer.java
@@ -25,7 +25,7 @@ public class SessionsMaintainer extends ConfigServerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
if (iteration % 10 == 0)
log.log(Level.INFO, () -> "Running " + SessionsMaintainer.class.getSimpleName() + ", iteration " + iteration);
@@ -38,7 +38,7 @@ public class SessionsMaintainer extends ConfigServerMaintainer {
}
iteration++;
- return true;
+ return 1.0;
}
}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java
index 7c01045ee72..0a7df2c9d21 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/TenantsMaintainer.java
@@ -31,12 +31,12 @@ public class TenantsMaintainer extends ConfigServerMaintainer {
}
@Override
- protected boolean maintain() {
- if ( ! applicationRepository.configserverConfig().hostedVespa()) return true;
+ protected double maintain() {
+ if ( ! applicationRepository.configserverConfig().hostedVespa()) return 1.0;
Set<TenantName> tenants = applicationRepository.deleteUnusedTenants(ttlForUnusedTenant, clock.instant());
if (tenants.size() > 0) log.log(Level.INFO, "Deleted tenants " + tenants);
- return true;
+ return 1.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java
index 7d94a4c728f..9ec8e4d1a2d 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationMetaDataGarbageCollector.java
@@ -19,14 +19,14 @@ public class ApplicationMetaDataGarbageCollector extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
try {
controller().applications().applicationStore().pruneMeta(controller().clock().instant().minus(Duration.ofDays(365)));
- return true;
+ return 1.0;
}
catch (Exception e) {
log.log(Level.WARNING, "Exception pruning old application meta data", e);
- return false;
+ return 0.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java
index 1f20e48edf5..69e0eb26f16 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmer.java
@@ -18,6 +18,7 @@ import com.yahoo.yolean.Exceptions;
import java.time.Duration;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
/**
@@ -39,15 +40,17 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
- return confirmApplicationOwnerships() &
- ensureConfirmationResponses() &
- updateConfirmedApplicationOwners();
+ protected double maintain() {
+ return ( confirmApplicationOwnerships() +
+ ensureConfirmationResponses() +
+ updateConfirmedApplicationOwners() )
+ / 3;
}
/** File an ownership issue with the owners of all applications we know about. */
- private boolean confirmApplicationOwnerships() {
- AtomicBoolean success = new AtomicBoolean(true);
+ private double confirmApplicationOwnerships() {
+ AtomicInteger attempts = new AtomicInteger(0);
+ AtomicInteger failures = new AtomicInteger(0);
applications()
.withProjectId()
.withProductionDeployment()
@@ -56,6 +59,7 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer {
.filter(application -> application.createdAt().isBefore(controller().clock().instant().minus(Duration.ofDays(90))))
.forEach(application -> {
try {
+ attempts.incrementAndGet();
// TODO jvenstad: Makes sense to require, and run this only in main?
tenantOf(application.id()).contact().flatMap(contact -> {
return ownershipIssues.confirmOwnership(application.ownershipIssueId(),
@@ -65,17 +69,17 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer {
}).ifPresent(newIssueId -> store(newIssueId, application.id()));
}
catch (RuntimeException e) { // Catch errors due to wrong data in the controller, or issues client timeout.
- success.set(false);
+ failures.incrementAndGet();
log.log(Level.INFO, "Exception caught when attempting to file an issue for '" + application.id() + "': " + Exceptions.toMessageString(e));
}
});
- return success.get();
+ return asSuccessFactor(attempts.get(), failures.get());
}
private ApplicationSummary summaryOf(TenantAndApplicationId application) {
var app = applications.requireApplication(application);
var metrics = new HashMap<ZoneId, ApplicationSummary.Metric>();
- for (Instance instance : app.instances().values())
+ for (Instance instance : app.instances().values()) {
for (var kv : instance.deployments().entrySet()) {
var zone = kv.getKey();
var deploymentMetrics = kv.getValue().metrics();
@@ -83,28 +87,31 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer {
deploymentMetrics.queriesPerSecond(),
deploymentMetrics.writesPerSecond()));
}
+ }
return new ApplicationSummary(app.id().defaultInstance(), app.activity().lastQueried(), app.activity().lastWritten(),
app.latestVersion().flatMap(version -> version.buildTime()), metrics);
}
/** Escalate ownership issues which have not been closed before a defined amount of time has passed. */
- private boolean ensureConfirmationResponses() {
- AtomicBoolean success = new AtomicBoolean(true);
+ private double ensureConfirmationResponses() {
+ AtomicInteger attempts = new AtomicInteger(0);
+ AtomicInteger failures = new AtomicInteger(0);
for (Application application : applications())
application.ownershipIssueId().ifPresent(issueId -> {
try {
+ attempts.incrementAndGet();
Tenant tenant = tenantOf(application.id());
ownershipIssues.ensureResponse(issueId, tenant.contact());
}
catch (RuntimeException e) {
- success.set(false);
+ failures.incrementAndGet();
log.log(Level.INFO, "Exception caught when attempting to escalate issue with id '" + issueId + "': " + Exceptions.toMessageString(e));
}
});
- return success.get();
+ return asSuccessFactor(attempts.get(), failures.get());
}
- private boolean updateConfirmedApplicationOwners() {
+ private double updateConfirmedApplicationOwners() {
applications()
.withProjectId()
.withProductionDeployment()
@@ -118,7 +125,7 @@ public class ApplicationOwnershipConfirmer extends ControllerMaintainer {
controller().applications().store(lockedApplication.withOwner(owner)));
});
});
- return true;
+ return 1.0;
}
private ApplicationList applications() {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java
index 1a9889284e1..b096a853541 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveAccessMaintainer.java
@@ -37,8 +37,7 @@ public class ArchiveAccessMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
-
+ protected double maintain() {
// Count buckets - so we can alert if we get close to the account limit of 1000
zoneRegistry.zones().all().ids().forEach(zoneId ->
metric.set(bucketCountMetricName, archiveBucketDb.buckets(zoneId).size(),
@@ -59,6 +58,7 @@ public class ArchiveAccessMaintainer extends ControllerMaintainer {
)
);
- return true;
+ return 1.0;
}
+
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java
index d2141b097b3..ab8e5efa0bd 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ArchiveUriUpdater.java
@@ -38,7 +38,7 @@ public class ArchiveUriUpdater extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
Map<ZoneId, Set<TenantName>> tenantsByZone = new HashMap<>();
for (var application : applications.asList()) {
for (var instance : application.instances().values()) {
@@ -63,7 +63,7 @@ public class ArchiveUriUpdater extends ControllerMaintainer {
.forEach(tenant -> nodeRepository.removeArchiveUri(zone, tenant));
});
- return true;
+ return 1.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java
index 1f360c477b9..14e3e685a8a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeRequestMaintainer.java
@@ -43,14 +43,14 @@ public class ChangeRequestMaintainer extends ControllerMaintainer {
@Override
- protected boolean maintain() {
+ protected double maintain() {
var currentChangeRequests = pruneOldChangeRequests();
var changeRequests = changeRequestClient.getChangeRequests(currentChangeRequests);
logger.fine(() -> "Found requests: " + changeRequests);
storeChangeRequests(changeRequests);
- return true;
+ return 1.0;
}
private void storeChangeRequests(List<ChangeRequest> changeRequests) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java
index d923db936cb..5acd0c63670 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CloudEventReporter.java
@@ -38,7 +38,7 @@ public class CloudEventReporter extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
for (var region : zonesByCloudNativeRegion.keySet()) {
List<CloudEvent> events = eventFetcher.getEvents(region);
for (var event : events) {
@@ -48,7 +48,7 @@ public class CloudEventReporter extends ControllerMaintainer {
deprovisionAffectedHosts(region, event);
}
}
- return true;
+ return 1.0;
}
/** Deprovision any host affected by given event */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java
index 7b846fa288c..5ee39f7c8f2 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContactInformationMaintainer.java
@@ -35,12 +35,14 @@ public class ContactInformationMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
TenantController tenants = controller().tenants();
- boolean success = true;
+ int attempts = 0;
+ int failures = 0;
for (Tenant tenant : tenants.asList()) {
log.log(FINE, () -> "Updating contact information for " + tenant);
try {
+ attempts++;
switch (tenant.type()) {
case athenz:
tenants.lockIfPresent(tenant.name(), LockedTenant.Athenz.class, lockedTenant -> {
@@ -56,13 +58,13 @@ public class ContactInformationMaintainer extends ControllerMaintainer {
throw new IllegalArgumentException("Unexpected tenant type '" + tenant.type() + "'.");
}
} catch (Exception e) {
- success = false;
+ failures++;
log.log(Level.WARNING, "Failed to update contact information for " + tenant + ": " +
Exceptions.toMessageString(e) + ". Retrying in " +
interval());
}
}
- return success;
+ return asSuccessFactor(attempts, failures);
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java
index ff5fc4d2051..f1574381a3d 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ContainerImageExpirer.java
@@ -34,7 +34,7 @@ public class ContainerImageExpirer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
Instant now = controller().clock().instant();
VersionStatus versionStatus = controller().readVersionStatus();
List<ContainerImage> imagesToExpire = controller().serviceRegistry().containerRegistry().list().stream()
@@ -44,7 +44,7 @@ public class ContainerImageExpirer extends ControllerMaintainer {
log.log(Level.INFO, "Expiring " + imagesToExpire.size() + " container images: " + imagesToExpire);
controller().serviceRegistry().containerRegistry().deleteAll(imagesToExpire);
}
- return true;
+ return 1.0;
}
/** Returns whether given image is expired */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
index 03a6268397e..810c412fcc0 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
@@ -34,7 +34,7 @@ public abstract class ControllerMaintainer extends Maintainer {
public ControllerMaintainer(Controller controller, Duration interval, String name, Set<SystemName> activeSystems) {
super(name, interval, controller.clock().instant(), controller.jobControl(),
- jobMetrics(controller.metric()), controller.curator().cluster(), true);
+ new ControllerJobMetrics(controller.metric()), controller.curator().cluster(), true);
this.controller = controller;
this.activeSystems = Set.copyOf(Objects.requireNonNull(activeSystems));
}
@@ -47,10 +47,20 @@ public abstract class ControllerMaintainer extends Maintainer {
super.run();
}
- private static JobMetrics jobMetrics(Metric metric) {
- return new JobMetrics((job, consecutiveFailures) -> {
- metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
- });
+ private static class ControllerJobMetrics extends JobMetrics {
+
+ private final Metric metric;
+
+ public ControllerJobMetrics(Metric metric) {
+ this.metric = metric;
+ }
+
+ @Override
+ protected void recordCompletion(String job, Long incompleteRuns, double successFactor) {
+ metric.set("maintenance.consecutiveFailures", incompleteRuns, metric.createContext(Map.of("job", job)));
+ metric.set("maintenance.successFactor", successFactor, metric.createContext(Map.of("job", job)));
+ }
+
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java
index 28b64b5bfe0..21cda09d92a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/CostReportMaintainer.java
@@ -31,10 +31,10 @@ public class CostReportMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
var csv = CostCalculator.resourceShareByPropertyToCsv(nodeRepository, controller(), clock, consumer.fixedAllocations());
consumer.consume(csv);
- return true;
+ return 1.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java
index e5316788802..9e3da506ca8 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentExpirer.java
@@ -28,8 +28,9 @@ public class DeploymentExpirer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
- boolean success = true;
+ protected double maintain() {
+ int attempts = 0;
+ int failures = 0;
for (Application application : controller().applications().readable()) {
for (Instance instance : application.instances().values())
for (Deployment deployment : instance.deployments().values()) {
@@ -37,16 +38,17 @@ public class DeploymentExpirer extends ControllerMaintainer {
try {
log.log(Level.INFO, "Expiring deployment of " + instance.id() + " in " + deployment.zone());
+ attempts++;
controller().applications().deactivate(instance.id(), deployment.zone());
} catch (Exception e) {
- success = false;
+ failures++;
log.log(Level.WARNING, "Could not expire " + deployment + " of " + instance +
": " + Exceptions.toMessageString(e) + ". Retrying in " +
interval());
}
}
}
- return success;
+ return asSuccessFactor(attempts, failures);
}
/** Returns whether given deployment has expired according to its TTL */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
index a3070ef55a0..4e53e07f5af 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporter.java
@@ -21,6 +21,7 @@ import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import static com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence.broken;
@@ -45,10 +46,11 @@ public class DeploymentIssueReporter extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
- return maintainDeploymentIssues(applications()) &
- maintainPlatformIssue(applications()) &
- escalateInactiveDeploymentIssues(applications());
+ protected double maintain() {
+ return ( maintainDeploymentIssues(applications()) +
+ maintainPlatformIssue(applications()) +
+ escalateInactiveDeploymentIssues(applications()))
+ / 3;
}
/** Returns the applications to maintain issue status for. */
@@ -63,7 +65,7 @@ public class DeploymentIssueReporter extends ControllerMaintainer {
* and store the issue id for the filed issues. Also, clear the issueIds of applications
* where deployment has not failed for this amount of time.
*/
- private boolean maintainDeploymentIssues(List<Application> applications) {
+ private double maintainDeploymentIssues(List<Application> applications) {
List<TenantAndApplicationId> failingApplications = controller().jobController().deploymentStatuses(ApplicationList.from(applications))
.failingApplicationChangeSince(controller().clock().instant().minus(maxFailureAge))
.mapToList(status -> status.application().id());
@@ -73,7 +75,7 @@ public class DeploymentIssueReporter extends ControllerMaintainer {
fileDeploymentIssueFor(application);
else
store(application.id(), null);
- return true;
+ return 1.0;
}
/**
@@ -81,27 +83,26 @@ public class DeploymentIssueReporter extends ControllerMaintainer {
* applications that have been failing the upgrade to the system version for
* longer than the set grace period, or update this list if the issue already exists.
*/
- private boolean maintainPlatformIssue(List<Application> applications) {
- boolean success = true;
+ private double maintainPlatformIssue(List<Application> applications) {
if (controller().system() == SystemName.cd)
- return success;
+ return 1.0;
VersionStatus versionStatus = controller().readVersionStatus();
Version systemVersion = controller().systemVersion(versionStatus);
if (versionStatus.version(systemVersion).confidence() != broken)
- return success;
+ return 1.0;
DeploymentStatusList statuses = controller().jobController().deploymentStatuses(ApplicationList.from(applications));
if (statuses.failingUpgradeToVersionSince(systemVersion, controller().clock().instant().minus(upgradeGracePeriod)).isEmpty())
- return success;
+ return 1.0;
List<ApplicationId> failingApplications = statuses.failingUpgradeToVersionSince(systemVersion, controller().clock().instant())
.mapToList(status -> status.application().id().defaultInstance());
// TODO jonmv: Send only tenant and application, here and elsewhere in this.
deploymentIssues.fileUnlessOpen(failingApplications, systemVersion);
- return success;
+ return 1.0;
}
private Tenant ownerOf(TenantAndApplicationId applicationId) {
@@ -126,21 +127,23 @@ public class DeploymentIssueReporter extends ControllerMaintainer {
}
/** Escalate issues for which there has been no activity for a certain amount of time. */
- private boolean escalateInactiveDeploymentIssues(Collection<Application> applications) {
- AtomicBoolean success = new AtomicBoolean(true);
+ private double escalateInactiveDeploymentIssues(Collection<Application> applications) {
+ AtomicInteger attempts = new AtomicInteger(0);
+ AtomicInteger failures = new AtomicInteger(0);
applications.forEach(application -> application.deploymentIssueId().ifPresent(issueId -> {
try {
+ attempts.incrementAndGet();
Tenant tenant = ownerOf(application.id());
deploymentIssues.escalateIfInactive(issueId,
maxInactivity,
tenant.type() == Tenant.Type.athenz ? tenant.contact() : Optional.empty());
}
catch (RuntimeException e) {
- success.set(false);
+ failures.incrementAndGet();
log.log(Level.INFO, "Exception caught when attempting to escalate issue with id '" + issueId + "': " + Exceptions.toMessageString(e));
}
}));
- return success.get();
+ return asSuccessFactor(attempts.get(), failures.get());
}
private void store(TenantAndApplicationId id, IssueId issueId) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index a8214ac8a09..20154c4f122 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -44,7 +44,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
AtomicInteger failures = new AtomicInteger(0);
AtomicInteger attempts = new AtomicInteger(0);
AtomicReference<Exception> lastException = new AtomicReference<>(null);
@@ -92,7 +92,7 @@ public class DeploymentMetricsMaintainer extends ControllerMaintainer {
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
- return lastException.get() == null;
+ return asSuccessFactor(attempts.get(), failures.get());
}
static DeploymentMetrics updateDeploymentMetrics(DeploymentMetrics current, List<ClusterMetrics> metrics) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java
index 55a957f0247..85a69b0f338 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java
@@ -54,7 +54,7 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
try {
// In order of importance
deployRefreshedCertificates();
@@ -62,10 +62,10 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer {
deleteUnusedCertificates();
} catch (Exception e) {
log.log(LogLevel.ERROR, "Exception caught while maintaining endpoint certificates", e);
- return false;
+ return 0.0;
}
- return true;
+ return 1.0;
}
private void updateRefreshedCertificates() {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java
index 83ccda422e6..10e6f9eb039 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/HostInfoUpdater.java
@@ -38,7 +38,7 @@ public class HostInfoUpdater extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
Map<String, NodeEntity> nodeEntities = controller().serviceRegistry().entityService().listNodes().stream()
.collect(Collectors.toMap(NodeEntity::hostname,
Function.identity()));
@@ -62,7 +62,7 @@ public class HostInfoUpdater extends ControllerMaintainer {
LOG.info("Updated information for " + hostsUpdated + " hosts(s)");
}
}
- return true;
+ return 1.0;
}
private static Optional<String> buildModelName(NodeEntity nodeEntity) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java
index 9859d12510a..5101de73a33 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/InfrastructureUpgrader.java
@@ -39,23 +39,28 @@ public abstract class InfrastructureUpgrader<VERSION> extends ControllerMaintain
}
@Override
- protected boolean maintain() {
- targetVersion().ifPresent(target -> upgradeAll(target, managedApplications));
- return true;
+ protected double maintain() {
+ if (targetVersion().isEmpty()) return 1.0;
+ return upgradeAll(targetVersion().get(), managedApplications);
}
/** Deploy a list of system applications until they converge on the given version */
- private void upgradeAll(VERSION target, List<SystemApplication> applications) {
+ private double upgradeAll(VERSION target, List<SystemApplication> applications) {
+ int attempts = 0;
+ int failures = 0;
for (List<ZoneApi> zones : upgradePolicy.asList()) {
boolean converged = true;
for (ZoneApi zone : zones) {
try {
+ attempts++;
converged &= upgradeAll(target, applications, zone);
} catch (UnreachableNodeRepositoryException e) {
+ failures++;
converged = false;
log.warning(String.format("%s: Failed to communicate with node repository in %s, continuing with next parallel zone: %s",
this, zone, Exceptions.toMessageString(e)));
} catch (Exception e) {
+ failures++;
converged = false;
log.warning(String.format("%s: Failed to upgrade zone: %s, continuing with next parallel zone: %s",
this, zone, Exceptions.toMessageString(e)));
@@ -65,6 +70,7 @@ public abstract class InfrastructureUpgrader<VERSION> extends ControllerMaintain
break;
}
}
+ return asSuccessFactor(attempts, failures);
}
/** Returns whether all applications have converged to the target version in zone */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
index b84cfe5af9b..25207b733f0 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
@@ -49,10 +49,10 @@ public class JobRunner extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
jobs.active().forEach(this::advance);
jobs.collectGarbage();
- return true;
+ return 1.0;
}
@Override
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index b26b94f0b28..3f65c2e49cd 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -73,7 +73,7 @@ public class MetricsReporter extends ControllerMaintainer {
}
@Override
- public boolean maintain() {
+ public double maintain() {
reportDeploymentMetrics();
reportRemainingRotations();
reportQueuedNameServiceRequests();
@@ -82,7 +82,7 @@ public class MetricsReporter extends ControllerMaintainer {
reportAuditLog();
reportBrokenSystemVersion(versionStatus);
reportTenantMetrics();
- return true;
+ return 1.0;
}
private void reportBrokenSystemVersion(VersionStatus versionStatus) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java
index e57affdc15d..fe20db00e05 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/NameServiceDispatcher.java
@@ -37,13 +37,12 @@ public class NameServiceDispatcher extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
- boolean success = true;
+ protected double maintain() {
try (var lock = db.lockNameServiceQueue()) {
var queue = db.readNameServiceQueue();
var instant = clock.instant();
var remaining = queue.dispatchTo(nameService, requestCount);
- if (queue == remaining) return success; // Queue unchanged
+ if (queue == remaining) return 1.0; // Queue unchanged
var dispatched = queue.first(requestCount);
if (!dispatched.requests().isEmpty()) {
@@ -54,7 +53,7 @@ public class NameServiceDispatcher extends ControllerMaintainer {
}
db.writeNameServiceQueue(remaining);
}
- return success;
+ return 1.0;
}
private static int requestCount(SystemName system) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java
index e1618f05a7d..666d1c3b23a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java
@@ -42,13 +42,13 @@ public class OsUpgradeScheduler extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
for (var cloud : supportedClouds()) {
Optional<Version> newTarget = newTargetIn(cloud);
if (newTarget.isEmpty()) continue;
controller().upgradeOsIn(cloud, newTarget.get(), upgradeBudget(), false);
}
- return true;
+ return 1.0;
}
/** Returns the new target version for given cloud, if any */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java
index cbd9207fda4..271dd277e1c 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsVersionStatusUpdater.java
@@ -18,16 +18,16 @@ public class OsVersionStatusUpdater extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
try {
OsVersionStatus newStatus = OsVersionStatus.compute(controller());
controller().updateOsVersionStatus(newStatus);
- return true;
+ return 1.0;
} catch (Exception e) {
log.log(Level.WARNING, "Failed to compute OS version status: " + Exceptions.toMessageString(e) +
". Retrying in " + interval());
}
- return false;
+ return 0.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java
index a032f266de5..9d93ac719b7 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java
@@ -19,13 +19,13 @@ public class OutstandingChangeDeployer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
for (Application application : ApplicationList.from(controller().applications().readable())
.withProductionDeployment()
.withDeploymentSpec()
.asList())
controller().applications().deploymentTrigger().triggerNewRevision(application.id());
- return true;
+ return 1.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java
index a626f21359a..ffe958cb63a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java
@@ -17,9 +17,9 @@ public class ReadyJobsTrigger extends ControllerMaintainer {
}
@Override
- public boolean maintain() {
+ public double maintain() {
controller().applications().deploymentTrigger().triggerReadyJobs();
- return true;
+ return 1.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java
index 263a33cf266..0bd74c844ae 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReindexingTriggerer.java
@@ -40,7 +40,7 @@ public class ReindexingTriggerer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
try {
Instant now = controller().clock().instant();
for (Application application : controller().applications().asList())
@@ -51,11 +51,11 @@ public class ReindexingTriggerer extends ControllerMaintainer {
&& reindexingIsReady(controller().applications().applicationReindexing(id, deployment.zone()), now))
controller().applications().reindex(id, deployment.zone(), List.of(), List.of(), true);
});
- return true;
+ return 1.0;
}
catch (RuntimeException e) {
log.log(Level.WARNING, "Failed to trigger reindexing: " + Exceptions.toMessageString(e));
- return false;
+ return 0.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java
index aed2e637e4b..39ad233ce46 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceMeterMaintainer.java
@@ -79,19 +79,19 @@ public class ResourceMeterMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
Collection<ResourceSnapshot> resourceSnapshots;
try {
resourceSnapshots = getAllResourceSnapshots();
} catch (Exception e) {
log.log(Level.WARNING, "Failed to collect resource snapshots. Retrying in " + interval() + ". Error: " +
Exceptions.toMessageString(e));
- return false;
+ return 0.0;
}
if (systemName.isPublic()) reportResourceSnapshots(resourceSnapshots);
updateDeploymentCost(resourceSnapshots);
- return true;
+ return 1.0;
}
void updateDeploymentCost(Collection<ResourceSnapshot> resourceSnapshots) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java
index c7bf7e765ed..ab988bcf0ac 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ResourceTagMaintainer.java
@@ -28,7 +28,7 @@ public class ResourceTagMaintainer extends ControllerMaintainer {
}
@Override
- public boolean maintain() {
+ public double maintain() {
controller().zoneRegistry().zones()
.ofCloud(CloudName.from("aws"))
.reachable()
@@ -38,7 +38,7 @@ public class ResourceTagMaintainer extends ControllerMaintainer {
if (taggedResources > 0)
log.log(Level.INFO, "Tagged " + taggedResources + " resources in " + zone.getId());
});
- return true;
+ return 1.0;
}
private Map<HostName, Optional<ApplicationId>> getTenantOfParentHosts(ZoneId zoneId) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java
index 3b0a1fca4af..e40d772a673 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/SystemRoutingPolicyMaintainer.java
@@ -21,14 +21,14 @@ public class SystemRoutingPolicyMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
for (var zone : controller().zoneRegistry().zones().all().ids()) {
for (var application : SystemApplication.values()) {
if (!application.hasEndpoint()) continue;
controller().routing().policies().refresh(application.id(), DeploymentSpec.empty, zone);
}
}
- return true;
+ return 1.0;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java
index 1265d687850..637ae10bcc6 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TenantRoleMaintainer.java
@@ -23,7 +23,7 @@ public class TenantRoleMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
var roleService = controller().serviceRegistry().roleService();
var tenants = controller().tenants().asList();
var tenantsWithRoles = tenants.stream()
@@ -31,7 +31,7 @@ public class TenantRoleMaintainer extends ControllerMaintainer {
.filter(this::hasProductionDeployment)
.collect(Collectors.toList());
roleService.maintainRoles(tenantsWithRoles);
- return true;
+ return 1.0;
}
private boolean hasProductionDeployment(TenantName tenant) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
index fbe9faa9754..0af0d01478b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
@@ -36,30 +36,34 @@ public class TrafficShareUpdater extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
- boolean success = false;
+ protected double maintain() {
Exception lastException = null;
+ int attempts = 0;
+ int failures = 0;
for (var application : applications.asList()) {
for (var instance : application.instances().values()) {
for (var deployment : instance.deployments().values()) {
if ( ! deployment.zone().environment().isProduction()) continue;
try {
- success |= updateTrafficFraction(instance, deployment);
+ attempts++;
+ updateTrafficFraction(instance, deployment);
}
catch (Exception e) {
// Some failures due to locked applications are expected and benign
+ failures++;
lastException = e;
}
}
}
}
- if ( ! success && lastException != null) // log on complete failure
+ double successFactor = asSuccessFactor(attempts, failures);
+ if ( successFactor == 0 )
log.log(Level.WARNING, "Could not update traffic share on any applications", lastException);
- return success;
+ return successFactor;
}
- private boolean updateTrafficFraction(Instance instance, Deployment deployment) {
+ private void updateTrafficFraction(Instance instance, Deployment deployment) {
double qpsInZone = deployment.metrics().queriesPerSecond();
double totalQps = instance.deployments().values().stream()
.filter(i -> i.zone().environment().isProduction())
@@ -73,7 +77,6 @@ public class TrafficShareUpdater extends ControllerMaintainer {
maxReadShare = currentReadShare; // distribution can be incorrect
nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare);
- return true;
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index 8d5019904fa..2326f7b66ee 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -51,7 +51,7 @@ public class Upgrader extends ControllerMaintainer {
* Schedule application upgrades. Note that this implementation must be idempotent.
*/
@Override
- public boolean maintain() {
+ public double maintain() {
// Determine target versions for each upgrade policy
VersionStatus versionStatus = controller().readVersionStatus();
Version canaryTarget = controller().systemVersion(versionStatus);
@@ -91,7 +91,7 @@ public class Upgrader extends ControllerMaintainer {
upgrade(instances.with(UpgradePolicy.canary), canaryTarget, targetMajorVersion, instances.size());
defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, targetMajorVersion, numberOfApplicationsToUpgrade()));
conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, targetMajorVersion, numberOfApplicationsToUpgrade()));
- return true;
+ return 1.0;
}
/** Returns the target versions for given confidence, one per major version in the system */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java
index fedf3d90760..4cd24289676 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java
@@ -57,7 +57,7 @@ public class VCMRMaintainer extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
var changeRequests = curator.readChangeRequests()
.stream()
.filter(shouldUpdate())
@@ -81,7 +81,7 @@ public class VCMRMaintainer extends ControllerMaintainer {
});
}
});
- return true;
+ return 1.0;
}
/**
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java
index a3e9672b715..e4866c43f13 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VersionStatusUpdater.java
@@ -29,7 +29,7 @@ public class VersionStatusUpdater extends ControllerMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
try {
VersionStatus newStatus = VersionStatus.compute(controller());
controller().updateVersionStatus(newStatus);
@@ -37,12 +37,12 @@ public class VersionStatusUpdater extends ControllerMaintainer {
controller().serviceRegistry().systemMonitor().reportSystemVersion(version.versionNumber(),
convert(version.confidence()));
});
- return true;
+ return 1.0;
} catch (Exception e) {
log.log(Level.WARNING, "Failed to compute version status: " + Exceptions.toMessageString(e) +
". Retrying in " + interval());
}
- return false;
+ return 0.0;
}
static SystemMonitor.Confidence convert(VespaVersion.Confidence confidence) {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java
index 27b4f3744e7..7dc5cb34818 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java
@@ -39,13 +39,16 @@ public class ControllerMaintainerTest {
TestControllerMaintainer maintainer = new TestControllerMaintainer(tester.controller(), SystemName.main, new AtomicInteger());
maintainer.run();
assertEquals(0L, consecutiveFailuresMetric());
+ assertEquals(1.0, successFactorMetric(), 0.0000001);
maintainer.success = false;
maintainer.run();
maintainer.run();
assertEquals(2L, consecutiveFailuresMetric());
+ assertEquals(0.0, successFactorMetric(), 0.0000001);
maintainer.success = true;
maintainer.run();
assertEquals(0, consecutiveFailuresMetric());
+ assertEquals(1.0, successFactorMetric(), 0.0000001);
}
private long consecutiveFailuresMetric() {
@@ -54,6 +57,12 @@ public class ControllerMaintainerTest {
"maintenance.consecutiveFailures").get().longValue();
}
+ private long successFactorMetric() {
+ MetricsMock metrics = (MetricsMock) tester.controller().metric();
+ return metrics.getMetric((context) -> "TestControllerMaintainer".equals(context.get("job")),
+ "maintenance.successFactor").get().longValue();
+ }
+
private static class TestControllerMaintainer extends ControllerMaintainer {
private final AtomicInteger executions;
@@ -65,9 +74,9 @@ public class ControllerMaintainerTest {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
executions.incrementAndGet();
- return success;
+ return success ? 1.0 : 0.0;
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java
index 66bda66bbf9..ce219b8beed 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java
@@ -33,7 +33,7 @@ public class EndpointCertificateMaintainerTest {
@Test
public void old_and_unused_cert_is_deleted() {
tester.curator().writeEndpointCertificateMetadata(ApplicationId.defaultId(), exampleMetadata);
- assertTrue(maintainer.maintain());
+ assertEquals(1.0, maintainer.maintain(), 0.0000001);
assertTrue(tester.curator().readEndpointCertificateMetadata(ApplicationId.defaultId()).isEmpty());
}
@@ -41,7 +41,7 @@ public class EndpointCertificateMaintainerTest {
public void unused_but_recently_used_cert_is_not_deleted() {
EndpointCertificateMetadata recentlyRequestedCert = exampleMetadata.withLastRequested(tester.clock().instant().minusSeconds(3600).getEpochSecond());
tester.curator().writeEndpointCertificateMetadata(ApplicationId.defaultId(), recentlyRequestedCert);
- assertTrue(maintainer.maintain());
+ assertEquals(1.0, maintainer.maintain(), 0.0000001);
assertEquals(Optional.of(recentlyRequestedCert), tester.curator().readEndpointCertificateMetadata(ApplicationId.defaultId()));
}
@@ -53,7 +53,7 @@ public class EndpointCertificateMaintainerTest {
secretStore.setSecret(exampleMetadata.keyName(), "foo", 1);
secretStore.setSecret(exampleMetadata.certName(), "bar", 1);
- assertTrue(maintainer.maintain());
+ assertEquals(1.0, maintainer.maintain(), 0.0000001);
var updatedCert = Optional.of(recentlyRequestedCert.withLastRefreshed(tester.clock().instant().getEpochSecond()).withVersion(1));
@@ -77,7 +77,7 @@ public class EndpointCertificateMaintainerTest {
tester.curator().writeEndpointCertificateMetadata(appId, exampleMetadata);
- assertTrue(maintainer.maintain());
+ assertEquals(1.0, maintainer.maintain(), 0.0000001);
assertTrue(tester.curator().readEndpointCertificateMetadata(appId).isPresent()); // cert should not be deleted, the app is deployed!
}
@@ -97,7 +97,7 @@ public class EndpointCertificateMaintainerTest {
tester.curator().writeEndpointCertificateMetadata(appId, exampleMetadata);
- assertTrue(maintainer.maintain());
+ assertEquals(1.0, maintainer.maintain(), 0.0000001);
assertTrue(tester.curator().readEndpointCertificateMetadata(appId).isPresent()); // cert should not be deleted, the app is deployed!
tester.clock().advance(Duration.ofDays(3));
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java
index 2afa3a0faea..7b4882de3ff 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java
@@ -39,7 +39,7 @@ public class TrafficShareUpdaterTest {
// Single zone
setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester);
deploymentMetricsMaintainer.maintain();
- assertTrue(updater.maintain());
+ assertEquals(1.0, updater.maintain(), 0.0000001);
assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester);
// Two zones
@@ -48,14 +48,14 @@ public class TrafficShareUpdaterTest {
setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester);
setQpsMetric(0.0, application.application().id().defaultInstance(), prod2, tester);
deploymentMetricsMaintainer.maintain();
- assertTrue(updater.maintain());
+ assertEquals(1.0, updater.maintain(), 0.0000001);
assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester);
assertTrafficFraction(0.0, 1.0, application.instanceId(), prod2, tester);
// - both hot
setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester);
setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester);
deploymentMetricsMaintainer.maintain();
- assertTrue(updater.maintain());
+ assertEquals(1.0, updater.maintain(), 0.0000001);
assertTrafficFraction(0.53, 1.0, application.instanceId(), prod1, tester);
assertTrafficFraction(0.47, 1.0, application.instanceId(), prod2, tester);
@@ -66,7 +66,7 @@ public class TrafficShareUpdaterTest {
setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester);
setQpsMetric(0.0, application.application().id().defaultInstance(), prod3, tester);
deploymentMetricsMaintainer.maintain();
- assertTrue(updater.maintain());
+ assertEquals(1.0, updater.maintain(), 0.0000001);
assertTrafficFraction(0.53, 0.53, application.instanceId(), prod1, tester);
assertTrafficFraction(0.47, 0.50, application.instanceId(), prod2, tester);
assertTrafficFraction(0.00, 0.50, application.instanceId(), prod3, tester);
@@ -75,7 +75,7 @@ public class TrafficShareUpdaterTest {
setQpsMetric(25.0, application.application().id().defaultInstance(), prod2, tester);
setQpsMetric(25.0, application.application().id().defaultInstance(), prod3, tester);
deploymentMetricsMaintainer.maintain();
- assertTrue(updater.maintain());
+ assertEquals(1.0, updater.maintain(), 0.0000001);
assertTrafficFraction(0.50, 0.5, application.instanceId(), prod1, tester);
assertTrafficFraction(0.25, 0.5, application.instanceId(), prod2, tester);
assertTrafficFraction(0.25, 0.5, application.instanceId(), prod3, tester);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
index 9ac1ca2b4c1..24160c19dfa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
@@ -41,9 +41,9 @@ public abstract class ApplicationMaintainer extends NodeRepositoryMaintainer {
}
@Override
- protected final boolean maintain() {
+ protected final double maintain() {
applicationsNeedingMaintenance().forEach(this::deploy);
- return true;
+ return 1.0;
}
/** Returns the number of deployments that are pending execution */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index 05eb878e1b1..3914a0c9f07 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -44,14 +44,13 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
- if ( ! nodeRepository().nodes().isWorking()) return false;
+ protected double maintain() {
+ if ( ! nodeRepository().nodes().isWorking()) return 0.0;
- boolean success = true;
- if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return success;
+ if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return 1.0;
activeNodesByApplication().forEach(this::autoscale);
- return success;
+ return 1.0;
}
private void autoscale(ApplicationId application, NodeList applicationNodes) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
index af32c285d99..0eb2038e233 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
@@ -74,13 +74,13 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
NodeList nodes = nodeRepository().nodes().list();
resumeProvisioning(nodes, lock);
convergeToCapacity(nodes);
}
- return true;
+ return 1.0;
}
/** Resume provisioning of already provisioned hosts and their children */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java
index 2443a12d198..25108425e6e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java
@@ -40,7 +40,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
NodeList expired = nodeRepository().nodes().list(fromState).matching(this::isExpired);
if ( ! expired.isEmpty()) {
@@ -49,7 +49,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer {
}
metric.add("expired." + fromState, expired.size(), null);
- return true;
+ return 1.0;
}
protected boolean isExpired(Node node) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index e98da35aa6a..7505ce42668 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -66,7 +66,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
List<Node> remainingNodes = new ArrayList<>(nodeRepository.nodes().list(Node.State.failed)
.nodeType(NodeType.tenant, NodeType.host)
.asList());
@@ -78,7 +78,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
recycleIf(remainingNodes, node ->
node.allocation().get().membership().cluster().isStateful() &&
node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statefulExpiry)));
- return true;
+ return 1.0;
}
/** Recycle the nodes matching condition, and remove those nodes from the nodes list. */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java
index caf20463f60..80f74a011c0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java
@@ -43,7 +43,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
Instant now = nodeRepository().clock().instant();
NodeList allNodes = nodeRepository().nodes().list();
for (var nodeType : NodeType.values()) {
@@ -51,7 +51,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer {
if (upgradingVespa(allNodes, nodeType)) continue;
unencryptedHosts(allNodes, nodeType).forEach(host -> encrypt(host, now));
}
- return true;
+ return 1.0;
}
/** Returns whether any node of given type is currently upgrading its Vespa version */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java
index e317333135c..d9f5ea6a7a9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java
@@ -39,9 +39,9 @@ public class InfrastructureProvisioner extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
infraDeployer.activateAllSupportedInfraApplications(false);
- return true;
+ return 1.0;
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java
index 10069fd1a18..ac9d8d6671a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.google.common.collect.Sets;
import com.yahoo.jdisc.Metric;
+import com.yahoo.lang.MutableInteger;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.lb.LoadBalancer;
@@ -54,9 +55,9 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
expireReserved();
- return removeInactive() & pruneReals();
+ return ( removeInactive() + pruneReals() ) / 2;
}
/** Move reserved load balancer that have expired to inactive */
@@ -68,7 +69,8 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
}
/** Deprovision inactive load balancers that have expired */
- private boolean removeInactive() {
+ private double removeInactive() {
+ MutableInteger attempts = new MutableInteger(0);
var failed = new ArrayList<LoadBalancerId>();
var lastException = new AtomicReference<Exception>();
var expiry = nodeRepository().clock().instant().minus(inactiveExpiry);
@@ -76,6 +78,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
lb.changedAt().isBefore(expiry) &&
allocatedNodes(lb.id()).isEmpty(), lb -> {
try {
+ attempts.add(1);
log.log(Level.INFO, () -> "Removing expired inactive load balancer " + lb.id());
service.remove(lb.id().application(), lb.id().cluster());
db.removeLoadBalancer(lb.id());
@@ -92,11 +95,12 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
.collect(Collectors.joining(", ")),
interval()));
}
- return lastException.get() == null;
+ return asSuccessFactor(attempts.get(), failed.size());
}
/** Remove reals from inactive load balancers */
- private boolean pruneReals() {
+ private double pruneReals() {
+ var attempts = new MutableInteger(0);
var failed = new ArrayList<LoadBalancerId>();
var lastException = new AtomicReference<Exception>();
patchLoadBalancers(lb -> lb.state() == State.inactive, lb -> {
@@ -107,6 +111,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
reals.removeIf(real -> !allocatedNodes.contains(real.hostname().value()));
if (reals.equals(lb.instance().get().reals())) return; // Nothing to remove
try {
+ attempts.add(1);
LOG.log(Level.INFO, () -> "Removing reals from inactive load balancer " + lb.id() + ": " + Sets.difference(lb.instance().get().reals(), reals));
service.create(new LoadBalancerSpec(lb.id().application(), lb.id().cluster(), reals), true);
db.writeLoadBalancer(lb.with(lb.instance().map(instance -> instance.withReals(reals))));
@@ -124,7 +129,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
interval()),
lastException.get());
}
- return lastException.get() == null;
+ return asSuccessFactor(attempts.get(), failed.size());
}
/** Patch load balancers matching given filter, while holding lock */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 85437b3e78a..3990c5099eb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -65,7 +65,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
}
@Override
- public boolean maintain() {
+ public double maintain() {
NodeList nodes = nodeRepository().nodes().list();
ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot();
@@ -80,7 +80,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
updateRepairTicketMetrics(nodes);
updateAllocationMetrics(nodes);
updateExclusiveSwitchMetrics(nodes);
- return true;
+ return 1.0;
}
private void updateAllocationMetrics(NodeList nodes) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index effa41dc69f..f16459ee8b9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -72,17 +72,21 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
- if ( ! nodeRepository().nodes().isWorking()) return false;
+ protected double maintain() {
+ if ( ! nodeRepository().nodes().isWorking()) return 0.0;
+ int attempts = 0;
+ int failures = 0;
int throttledHostFailures = 0;
int throttledNodeFailures = 0;
// Ready nodes
try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
+ attempts++;
Node node = entry.getKey();
if (throttle(node)) {
+ failures++;
if (node.type().isHost())
throttledHostFailures++;
else
@@ -96,10 +100,12 @@ public class NodeFailer extends NodeRepositoryMaintainer {
// Active nodes
for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) {
+ attempts++;
Node node = entry.getKey();
if (!failAllowedFor(node.type())) continue;
if (throttle(node)) {
+ failures++;
if (node.type().isHost())
throttledHostFailures++;
else
@@ -116,11 +122,15 @@ public class NodeFailer extends NodeRepositoryMaintainer {
if ( ! activeNodes.childrenOf(host).isEmpty()) continue;
Optional<NodeMutex> locked = Optional.empty();
try {
+ attempts++;
locked = nodeRepository().nodes().lockAndGet(host);
if (locked.isEmpty()) continue;
nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer,
"Host should be failed and have no tenant nodes");
}
+ catch (Exception e) {
+ failures++;
+ }
finally {
locked.ifPresent(NodeMutex::close);
}
@@ -130,7 +140,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
metric.set(throttlingActiveMetric, throttlingActive, null);
metric.set(throttledHostFailuresMetric, throttledHostFailures, null);
metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
- return throttlingActive == 0;
+ return asSuccessFactor(attempts, failures);
}
private Map<Node, String> getReadyNodesByFailureReason() {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index fe2fb5229f9..37969a30b81 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -5,6 +5,7 @@ import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ApplicationLockException;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.jdisc.Metric;
+import com.yahoo.lang.MutableInteger;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
@@ -48,13 +49,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
- updateReadyNodeLivenessEvents();
- updateActiveNodeDownState();
- return true;
+ protected double maintain() {
+ return ( updateReadyNodeLivenessEvents() + updateActiveNodeDownState() ) / 2;
}
- private void updateReadyNodeLivenessEvents() {
+ private double updateReadyNodeLivenessEvents() {
// Update node last request events through ZooKeeper to collect request to all config servers.
// We do this here ("lazily") to avoid writing to zk for each config request.
try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
@@ -69,13 +68,16 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
}
}
}
+ return 1.0;
}
/**
* If the node is down (see {@link #allDown}), and there is no "down" history record, we add it.
* Otherwise we remove any "down" history record.
*/
- private void updateActiveNodeDownState() {
+ private double updateActiveNodeDownState() {
+ var attempts = new MutableInteger(0);
+ var failures = new MutableInteger(0);
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
Optional<Node> node = activeNodes.node(hostname.toString());
@@ -90,6 +92,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
try (var lock = nodeRepository().nodes().lock(owner)) {
node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
if (node.isEmpty()) return; // Node disappeared or changed allocation
+ attempts.add(1);
if (isDown) {
recordAsDown(node.get(), lock);
} else {
@@ -98,8 +101,10 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
} catch (ApplicationLockException e) {
// Fine, carry on with other nodes. We'll try updating this one in the next run
log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
+ failures.add(1);
}
});
+ return asSuccessFactor(attempts.get(), failures.get());
}
/**
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java
index 1ea4577f7fe..d671900d08c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java
@@ -33,19 +33,21 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
+ int attempts = 0;
+ var failures = new MutableInteger(0);
try {
- var warnings = new MutableInteger(0);
Set<ApplicationId> applications = activeNodesByApplication().keySet();
- if (applications.isEmpty()) return true;
+ if (applications.isEmpty()) return 1.0;
long pauseMs = interval().toMillis() / applications.size() - 1; // spread requests over interval
int done = 0;
for (ApplicationId application : applications) {
+ attempts++;
metricsFetcher.fetchMetrics(application)
.whenComplete((metricsResponse, exception) -> handleResponse(metricsResponse,
exception,
- warnings,
+ failures,
application));
if (++done < applications.size())
Thread.sleep(pauseMs);
@@ -56,23 +58,22 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer {
nodeRepository().metricsDb().gc();
- // Suppress failures for manual zones for now to avoid noise
- return nodeRepository().zone().environment().isManuallyDeployed() || warnings.get() == 0;
+ return asSuccessFactor(attempts, failures.get());
}
catch (InterruptedException e) {
- return false;
+ return asSuccessFactor(attempts, failures.get());
}
}
private void handleResponse(MetricsResponse response,
Throwable exception,
- MutableInteger warnings,
+ MutableInteger failures,
ApplicationId application) {
if (exception != null) {
- if (warnings.get() < maxWarningsPerInvocation)
+ if (failures.get() < maxWarningsPerInvocation)
log.log(Level.WARNING, "Could not update metrics for " + application + ": " +
Exceptions.toMessageString(exception));
- warnings.add(1);
+ failures.add(1);
}
else if (response != null) {
nodeRepository().metricsDb().addNodeMetrics(response.nodeMetrics());
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
index 6ee657beadd..c282fcdb7fc 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
@@ -37,7 +37,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
// Reboot candidates: Nodes in long-term states, where we know we can safely orchestrate a reboot
List<Node> nodesToReboot = nodeRepository().nodes().list(Node.State.active, Node.State.ready).stream()
.filter(node -> node.type().isHost())
@@ -46,7 +46,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer {
if (!nodesToReboot.isEmpty())
nodeRepository().nodes().reboot(NodeListFilter.from(nodesToReboot));
- return true;
+ return 1.0;
}
private boolean shouldReboot(Node node) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
index 0a1f6961f9f..fe5cd419b31 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
@@ -25,7 +25,7 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) {
super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(),
- jobMetrics(metric), nodeRepository.database().cluster(), true);
+ new NodeRepositoryJobMetrics(metric), nodeRepository.database().cluster(), true);
this.nodeRepository = nodeRepository;
}
@@ -48,10 +48,20 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
.groupingBy(node -> node.allocation().get().owner());
}
- private static JobMetrics jobMetrics(Metric metric) {
- return new JobMetrics((job, consecutiveFailures) -> {
+ private static class NodeRepositoryJobMetrics extends JobMetrics {
+
+ private final Metric metric;
+
+ public NodeRepositoryJobMetrics(Metric metric) {
+ this.metric = metric;
+ }
+
+ @Override
+ protected void recordCompletion(String job, Long consecutiveFailures, double successFactor) {
metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
- });
+ metric.set("maintenance.successFactor", successFactor, metric.createContext(Map.of("job", job)));
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java
index 4eba15307cb..749603a373d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java
@@ -23,13 +23,13 @@ public class OsUpgradeActivator extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
for (var nodeType : NodeType.values()) {
if (!nodeType.isHost()) continue;
boolean resume = canUpgradeOsOf(nodeType);
nodeRepository().osVersions().resumeUpgradeOf(nodeType, resume);
}
- return true;
+ return 1.0;
}
/** Returns whether to allow OS upgrade of nodes of given type */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index 1543506a78e..7bb748c92c9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -33,19 +33,18 @@ public class Rebalancer extends NodeMover<Rebalancer.Move> {
}
@Override
- protected boolean maintain() {
- if ( ! nodeRepository().nodes().isWorking()) return false;
+ protected double maintain() {
+ if ( ! nodeRepository().nodes().isWorking()) return 0.0;
- boolean success = true;
- if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary
- if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance
+ if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; // Rebalancing not necessary
+ if (nodeRepository().zone().environment().isTest()) return 1.0; // Short lived deployments; no need to rebalance
// Work with an unlocked snapshot as this can take a long time and full consistency is not needed
NodeList allNodes = nodeRepository().nodes().list();
updateSkewMetric(allNodes);
- if ( ! zoneIsStable(allNodes)) return success;
+ if ( ! zoneIsStable(allNodes)) return 1.0;
findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository());
- return success;
+ return 1.0;
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index f72daf1bc2b..3f5893b368a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -48,7 +48,10 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
+ int attempts = 0;
+ int successes = 0;
+
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
Map<ApplicationId, NodeList> retiredNodesByApplication = activeNodes.retired().groupingBy(node -> node.allocation().get().owner());
for (Map.Entry<ApplicationId, NodeList> entry : retiredNodesByApplication.entrySet()) {
@@ -57,17 +60,19 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList());
if (nodesToRemove.isEmpty()) continue;
+ attempts++;
try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
if ( ! deployment.isValid()) continue;
nodeRepository().nodes().setRemovable(application, nodesToRemove);
boolean success = deployment.activate().isPresent();
- if ( ! success) return success;
+ if ( ! success) continue;
String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
+ successes++;
}
}
- return true;
+ return attempts == 0 ? 1.0 : ((double)successes / attempts);
}
/**
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
index c217580872b..888f06a5004 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
@@ -36,13 +36,16 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
- if ( ! nodeRepository().zone().environment().isProduction()) return true;
+ protected double maintain() {
+ if ( ! nodeRepository().zone().environment().isProduction()) return 1.0;
+ int attempts = 0;
int successes = 0;
- for (var application : activeNodesByApplication().entrySet())
+ for (var application : activeNodesByApplication().entrySet()) {
+ attempts++;
successes += suggest(application.getKey(), application.getValue());
- return successes > 0;
+ }
+ return attempts == 0 ? 1.0 : ((double)successes / attempts);
}
private int suggest(ApplicationId application, NodeList applicationNodes) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
index 0307ae13b24..0589571e9d8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -66,12 +66,11 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
}
@Override
- protected boolean maintain() {
- if ( ! nodeRepository().nodes().isWorking()) return false;
+ protected double maintain() {
+ if ( ! nodeRepository().nodes().isWorking()) return 0.0;
- boolean success = true;
// Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand.
- if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success;
+ if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0;
NodeList allNodes = nodeRepository().nodes().list();
CapacityChecker capacityChecker = new CapacityChecker(allNodes);
@@ -80,6 +79,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
metric.set("overcommittedHosts", overcommittedHosts.size(), null);
retireOvercommitedHosts(allNodes, overcommittedHosts);
+ boolean success = true;
Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
if (failurePath.isPresent()) {
int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1;
@@ -96,7 +96,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
}
metric.set("spareHostCapacity", spareHostCapacity, null);
}
- return success;
+ return success ? 1.0 : 0.0;
}
private boolean execute(List<Move> mitigation, CapacityChecker.HostFailurePath failurePath) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
index cfab980570d..44890f2f5af 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
@@ -33,14 +33,14 @@ public class SwitchRebalancer extends NodeMover<Move> {
}
@Override
- protected boolean maintain() {
- if (!nodeRepository().nodes().isWorking()) return false;
- if (!nodeRepository().zone().environment().isProduction()) return true;
+ protected double maintain() {
+ if (!nodeRepository().nodes().isWorking()) return 0.0;
+ if (!nodeRepository().zone().environment().isProduction()) return 1.0;
NodeList allNodes = nodeRepository().nodes().list(); // Lockless as strong consistency is not needed
- if (!zoneIsStable(allNodes)) return true;
+ if (!zoneIsStable(allNodes)) return 1.0;
findBestMove(allNodes).execute(false, Agent.SwitchRebalancer, deployer, metric, nodeRepository());
- return true;
+ return 1.0;
}
@Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java
index 5b2f7ce91e8..cfe6e4d348d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java
@@ -43,7 +43,7 @@ public class NodeMetricsDbMaintainerTest {
fetcher,
Duration.ofHours(1),
new TestMetric());
- assertTrue(maintainer.maintain());
+ assertEquals(maintainer.maintain(), 1.0, 0.0000001);
List<NodeTimeseries> timeseriesList = tester.nodeRepository().metricsDb().getNodeTimeseries(Duration.ofDays(1),
Set.of("host-1.yahoo.com", "host-2.yahoo.com"));
assertEquals(2, timeseriesList.size());
diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java
index fcc5b8e57a2..73a5dc77743 100644
--- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java
+++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java
@@ -2,25 +2,18 @@
package com.yahoo.concurrent.maintenance;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.function.BiConsumer;
/**
* Tracks and forwards maintenance job metrics.
*
* @author mpolden
*/
-public class JobMetrics {
-
- private final BiConsumer<String, Long> metricConsumer;
+public abstract class JobMetrics {
private final ConcurrentHashMap<String, Long> incompleteRuns = new ConcurrentHashMap<>();
- public JobMetrics(BiConsumer<String, Long> metricConsumer) {
- this.metricConsumer = metricConsumer;
- }
-
- /** Record a run for given job */
- public void recordRunOf(String job) {
+ /** Record starting of a run of a job */
+ public void starting(String job) {
incompleteRuns.merge(job, 1L, Long::sum);
}
@@ -29,12 +22,17 @@ public class JobMetrics {
incompleteRuns.put(job, 0L);
}
- /** Forward metrics for given job to metric consumer */
- public void forward(String job) {
+ /**
+ * Records completion of a run of a job.
+ * This is guaranteed to always be called once whenever starting has been called.
+ */
+ public void completed(String job, double successFactor) {
Long incompleteRuns = this.incompleteRuns.get(job);
if (incompleteRuns != null) {
- metricConsumer.accept(job, incompleteRuns);
+ recordCompletion(job, incompleteRuns, successFactor);
}
}
+ protected abstract void recordCompletion(String job, Long incompleteRuns, double successFactor);
+
}
diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java
index 734c46a2819..2a9e6dda6b6 100644
--- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java
+++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java
@@ -83,8 +83,19 @@ public abstract class Maintainer implements Runnable {
@Override
public final String toString() { return name(); }
- /** Called once each time this maintenance job should run. Returns whether the maintenance run was successful */
- protected abstract boolean maintain();
+ /**
+ * Called once each time this maintenance job should run.
+ *
+ * @return the degree to which the run was successful - a number between 0 (no success), to 1 (complete success).
+ * Note that this indicates whether something is wrong, so e.g if the call did nothing because it should do
+ * nothing, 1.0 should be returned.
+ */
+ protected abstract double maintain();
+
+ /** Convenience methods to convert attempts and failures into a success factor */
+ protected final double asSuccessFactor(int attempts, int failures) {
+ return attempts == 0 ? 1.0 : 1 - (double)failures / attempts;
+ }
/** Returns the interval at which this job is set to run */
protected Duration interval() { return interval; }
@@ -93,9 +104,12 @@ public abstract class Maintainer implements Runnable {
public final void lockAndMaintain(boolean force) {
if (!force && !jobControl.isActive(name())) return;
log.log(Level.FINE, () -> "Running " + this.getClass().getSimpleName());
- jobMetrics.recordRunOf(name());
+ jobMetrics.starting(name());
+ double successFactor = 0;
try (var lock = jobControl.lockJob(name())) {
- if (maintain()) jobMetrics.recordCompletionOf(name());
+ successFactor = maintain();
+ if (successFactor > 0.0)
+ jobMetrics.recordCompletionOf(name());
} catch (UncheckedTimeoutException e) {
if (ignoreCollision) {
jobMetrics.recordCompletionOf(name());
@@ -105,7 +119,7 @@ public abstract class Maintainer implements Runnable {
} catch (Throwable e) {
log.log(Level.WARNING, this + " failed. Will retry in " + interval, e);
} finally {
- jobMetrics.forward(name());
+ jobMetrics.completed(name(), successFactor);
}
log.log(Level.FINE, () -> "Finished " + this.getClass().getSimpleName());
}
diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java
index 139a2901cd3..01560c050ff 100644
--- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java
+++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java
@@ -3,6 +3,8 @@ package com.yahoo.concurrent.maintenance;
import org.junit.Test;
+import java.util.concurrent.atomic.AtomicLong;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -19,9 +21,8 @@ public class JobControlTest {
String job1 = "Job1";
String job2 = "Job2";
- JobMetrics metrics = new JobMetrics((job, instant) -> {});
- TestMaintainer maintainer1 = new TestMaintainer(job1, jobControl, metrics);
- TestMaintainer maintainer2 = new TestMaintainer(job2, jobControl, metrics);
+ TestMaintainer maintainer1 = new TestMaintainer(job1, jobControl, new NoopJobMetrics());
+ TestMaintainer maintainer2 = new TestMaintainer(job2, jobControl, new NoopJobMetrics());
assertEquals(2, jobControl.jobs().size());
assertTrue(jobControl.jobs().contains(job1));
assertTrue(jobControl.jobs().contains(job2));
@@ -62,7 +63,7 @@ public class JobControlTest {
public void testJobControlMayDeactivateJobs() {
JobControlStateMock state = new JobControlStateMock();
JobControl jobControl = new JobControl(state);
- TestMaintainer mockMaintainer = new TestMaintainer(null, jobControl, new JobMetrics((job, instant) -> {}));
+ TestMaintainer mockMaintainer = new TestMaintainer(null, jobControl, new NoopJobMetrics());
assertTrue(jobControl.jobs().contains("TestMaintainer"));
@@ -80,4 +81,11 @@ public class JobControlTest {
assertEquals(2, mockMaintainer.totalRuns());
}
+ private static class NoopJobMetrics extends JobMetrics {
+
+ @Override
+ protected void recordCompletion(String job, Long incompleteRuns, double successFactor) { }
+
+ }
+
}
diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java
index e881d4b3ff6..d2db380f4a1 100644
--- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java
+++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java
@@ -42,35 +42,45 @@ public class MaintainerTest {
@Test
public void success_metric() {
- AtomicLong consecutiveFailures = new AtomicLong();
- JobMetrics jobMetrics = new JobMetrics((job, count) -> consecutiveFailures.set(count));
+ TestJobMetrics jobMetrics = new TestJobMetrics();
TestMaintainer maintainer = new TestMaintainer(null, jobControl, jobMetrics);
// Maintainer fails twice in a row
maintainer.successOnNextRun(false).run();
- assertEquals(1, consecutiveFailures.get());
+ assertEquals(1, jobMetrics.consecutiveFailures.get());
maintainer.successOnNextRun(false).run();
- assertEquals(2, consecutiveFailures.get());
+ assertEquals(2, jobMetrics.consecutiveFailures.get());
// Maintainer runs successfully
maintainer.successOnNextRun(true).run();
- assertEquals(0, consecutiveFailures.get());
+ assertEquals(0, jobMetrics.consecutiveFailures.get());
// Maintainer runs successfully again
maintainer.run();
- assertEquals(0, consecutiveFailures.get());
+ assertEquals(0, jobMetrics.consecutiveFailures.get());
// Maintainer throws
maintainer.throwOnNextRun(new RuntimeException()).run();
- assertEquals(1, consecutiveFailures.get());
+ assertEquals(1, jobMetrics.consecutiveFailures.get());
// Maintainer recovers
maintainer.throwOnNextRun(null).run();
- assertEquals(0, consecutiveFailures.get());
+ assertEquals(0, jobMetrics.consecutiveFailures.get());
// Lock exception is treated as a failure
maintainer.throwOnNextRun(new UncheckedTimeoutException()).run();
- assertEquals(1, consecutiveFailures.get());
+ assertEquals(1, jobMetrics.consecutiveFailures.get());
+ }
+
+ private static class TestJobMetrics extends JobMetrics {
+
+ AtomicLong consecutiveFailures = new AtomicLong();
+
+ @Override
+ protected void recordCompletion(String job, Long incompleteRuns, double successFactor) {
+ consecutiveFailures.set(incompleteRuns);
+ }
+
}
}
diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java
index 44a00a37a83..7424b17cab2 100644
--- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java
+++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java
@@ -33,10 +33,10 @@ class TestMaintainer extends Maintainer {
}
@Override
- protected boolean maintain() {
+ protected double maintain() {
if (exceptionToThrow != null) throw exceptionToThrow;
totalRuns++;
- return success;
+ return success ? 1.0 : 0.0;
}
}