// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; import ai.vespa.metrics.ConfigServerMetrics; import ai.vespa.metrics.ControllerMetrics; import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentInstanceSpec; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.zone.ZoneId; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.athenz.client.zms.ZmsClient; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion; import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics; import com.yahoo.vespa.hosted.controller.auditlog.AuditLog; import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatusList; import com.yahoo.vespa.hosted.controller.deployment.JobList; import com.yahoo.vespa.hosted.controller.routing.rotation.RotationLock; import com.yahoo.vespa.hosted.controller.versions.NodeVersion; import com.yahoo.vespa.hosted.controller.versions.VersionStatus; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.time.temporal.ChronoUnit; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; import java.util.stream.Collectors; /** * This calculates and reports system-wide metrics based on data from a {@link Controller}. * * @author mortent * @author mpolden */ public class MetricsReporter extends ControllerMaintainer { public static final String TENANT_METRIC = ControllerMetrics.BILLING_TENANTS.baseName(); public static final String DEPLOYMENT_FAIL_METRIC = ControllerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.baseName(); public static final String DEPLOYMENT_AVERAGE_DURATION = ControllerMetrics.DEPLOYMENT_AVERAGE_DURATION.baseName(); public static final String DEPLOYMENT_FAILING_UPGRADES = ControllerMetrics.DEPLOYMENT_FAILING_UPGRADES.baseName(); public static final String DEPLOYMENT_BUILD_AGE_SECONDS = ControllerMetrics.DEPLOYMENT_BUILD_AGE_SECONDS.baseName(); public static final String DEPLOYMENT_WARNINGS = ControllerMetrics.DEPLOYMENT_WARNINGS.baseName(); public static final String DEPLOYMENT_OVERDUE_UPGRADE = ControllerMetrics.DEPLOYMENT_OVERDUE_UPGRADE_SECONDS.baseName(); public static final String OS_CHANGE_DURATION = ControllerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.baseName(); public static final String PLATFORM_CHANGE_DURATION = ControllerMetrics.DEPLOYMENT_PLATFORM_CHANGE_DURATION.baseName(); public static final String OS_NODE_COUNT = ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.baseName(); public static final String PLATFORM_NODE_COUNT = ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_PLATFORM_VERSION.baseName(); public static final String BROKEN_SYSTEM_VERSION = ControllerMetrics.DEPLOYMENT_BROKEN_SYSTEM_VERSION.baseName(); public static final String REMAINING_ROTATIONS = ControllerMetrics.REMAINING_ROTATIONS.baseName(); public static final String NAME_SERVICE_REQUESTS_QUEUED = ControllerMetrics.DNS_QUEUED_REQUESTS.baseName(); public static final String OPERATION_PREFIX = "operation."; public static final String ZMS_QUOTA_USAGE = ControllerMetrics.ZMS_QUOTA_USAGE.baseName(); private final Metric metric; private final Clock clock; private final ZmsClient zmsClient; // Keep track of reported node counts for each version private final ConcurrentHashMap nodeCounts = new ConcurrentHashMap<>(); public MetricsReporter(Controller controller, Metric metric, ZmsClient zmsClient) { super(controller, Duration.ofMinutes(1)); // use fixed rate for metrics this.metric = metric; this.clock = controller.clock(); this.zmsClient = zmsClient; } @Override public double maintain() { reportDeploymentMetrics(); reportRemainingRotations(); reportQueuedNameServiceRequests(); VersionStatus versionStatus = controller().readVersionStatus(); reportInfrastructureUpgradeMetrics(versionStatus); reportAuditLog(); reportBrokenSystemVersion(versionStatus); reportTenantMetrics(); reportZmsQuotaMetrics(); return 0.0; } private void reportBrokenSystemVersion(VersionStatus versionStatus) { Version systemVersion = controller().systemVersion(versionStatus); VespaVersion.Confidence confidence = versionStatus.version(systemVersion).confidence(); int isBroken = confidence == VespaVersion.Confidence.broken ? 1 : 0; metric.set(BROKEN_SYSTEM_VERSION, isBroken, metric.createContext(Map.of())); } private void reportAuditLog() { AuditLog log = controller().auditLogger().readLog(); HashMap> metricCounts = new HashMap<>(); for (AuditLog.Entry entry : log.entries()) { String[] resource = entry.resource().split("/"); if((resource.length > 1) && (resource[1] != null)) { String api = resource[1]; String operationMetric = OPERATION_PREFIX + api; HashMap dimension = metricCounts.get(operationMetric); if (dimension != null) { Integer count = dimension.get(entry.principal()); if (count != null) { dimension.replace(entry.principal(), ++count); } else { dimension.put(entry.principal(), 1); } } else { dimension = new HashMap<>(); dimension.put(entry.principal(),1); metricCounts.put(operationMetric, dimension); } } } for (String operationMetric : metricCounts.keySet()) { for (String userDimension : metricCounts.get(operationMetric).keySet()) { metric.set(operationMetric, (metricCounts.get(operationMetric)).get(userDimension), metric.createContext(Map.of("operator", userDimension))); } } } private void reportInfrastructureUpgradeMetrics(VersionStatus versionStatus) { Map osChangeDurations = osChangeDurations(); Map platformChangeDurations = platformChangeDurations(versionStatus); reportChangeDurations(osChangeDurations, OS_CHANGE_DURATION); reportChangeDurations(platformChangeDurations, PLATFORM_CHANGE_DURATION); reportNodeCount(osChangeDurations.keySet(), OS_NODE_COUNT); reportNodeCount(platformChangeDurations.keySet(), PLATFORM_NODE_COUNT); } private void reportRemainingRotations() { try (RotationLock lock = controller().routing().rotations().lock()) { int availableRotations = controller().routing().rotations().availableRotations(lock).size(); metric.set(REMAINING_ROTATIONS, availableRotations, metric.createContext(Map.of())); } } private void reportDeploymentMetrics() { ApplicationList applications = ApplicationList.from(controller().applications().readable()) .withProductionDeployment(); DeploymentStatusList deployments = controller().jobController().deploymentStatuses(applications); metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(deployments) * 100, metric.createContext(Map.of())); averageDeploymentDurations(deployments, clock.instant()).forEach((instance, duration) -> { metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(instance))); }); deploymentsFailingUpgrade(deployments).forEach((instance, failingJobs) -> { metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(instance))); }); deploymentWarnings(deployments).forEach((instance, warnings) -> { metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(instance))); }); overdueUpgradeDurationByInstance(deployments).forEach((instance, overduePeriod) -> { metric.set(DEPLOYMENT_OVERDUE_UPGRADE, overduePeriod.toSeconds(), metric.createContext(dimensions(instance))); }); for (Application application : applications.asList()) application.revisions().last() .flatMap(ApplicationVersion::buildTime) .ifPresent(buildTime -> metric.set(DEPLOYMENT_BUILD_AGE_SECONDS, controller().clock().instant().getEpochSecond() - buildTime.getEpochSecond(), metric.createContext(dimensions(application.id().defaultInstance())))); } private Map overdueUpgradeDurationByInstance(DeploymentStatusList deployments) { Instant now = clock.instant(); Map overdueUpgrades = new HashMap<>(); for (var deploymentStatus : deployments) { for (var kv : deploymentStatus.instanceJobs().entrySet()) { ApplicationId instance = kv.getKey(); JobList jobs = kv.getValue(); boolean upgradeRunning = !jobs.production().upgrading().isEmpty(); DeploymentInstanceSpec instanceSpec = deploymentStatus.application().deploymentSpec().requireInstance(instance.instance()); Duration overdueDuration = upgradeRunning ? overdueUpgradeDuration(now, instanceSpec) : Duration.ZERO; overdueUpgrades.put(instance, overdueDuration); } } return Collections.unmodifiableMap(overdueUpgrades); } /** Returns how long an upgrade has been running inside a block window */ static Duration overdueUpgradeDuration(Instant upgradingAt, DeploymentInstanceSpec instanceSpec) { Optional lastOpened = Optional.empty(); // When the upgrade window most recently opened Instant oneWeekAgo = upgradingAt.minus(Duration.ofDays(7)); Duration step = Duration.ofHours(1); for (Instant instant = upgradingAt.truncatedTo(ChronoUnit.HOURS); !instanceSpec.canUpgradeAt(instant); instant = instant.minus(step)) { if (!instant.isAfter(oneWeekAgo)) { // Wrapped around, the entire week is being blocked lastOpened = Optional.empty(); break; } lastOpened = Optional.of(instant); } if (lastOpened.isEmpty()) return Duration.ZERO; return Duration.between(lastOpened.get(), upgradingAt); } private void reportQueuedNameServiceRequests() { metric.set(NAME_SERVICE_REQUESTS_QUEUED, controller().curator().readNameServiceQueue().requests().size(), metric.createContext(Map.of())); } private void reportNodeCount(Set nodeVersions, String metricName) { Map newNodeCounts = nodeVersions.stream() .collect(Collectors.groupingBy(nodeVersion -> { return new NodeCountKey(metricName, nodeVersion.currentVersion(), nodeVersion.zone()); }, Collectors.counting())); nodeCounts.putAll(newNodeCounts); nodeCounts.forEach((key, count) -> { if (newNodeCounts.containsKey(key)) { // Version is still present: Update the metric. metric.set(metricName, count, metric.createContext(dimensions(key.zone, key.version))); } else if (key.metricName.equals(metricName)) { // Version is no longer present, but has been previously reported: Set it to zero. metric.set(metricName, 0, metric.createContext(dimensions(key.zone, key.version))); } }); } private void reportChangeDurations(Map changeDurations, String metricName) { changeDurations.forEach((nodeVersion, duration) -> { metric.set(metricName, duration.toSeconds(), metric.createContext(dimensions(nodeVersion.hostname(), nodeVersion.zone()))); }); } private void reportTenantMetrics() { if (! controller().system().isPublic()) return; var planCounter = new TreeMap(); controller().tenants().asList().forEach(tenant -> { var planId = controller().serviceRegistry().billingController().getPlan(tenant.name()); planCounter.merge(planId.value(), 1, Integer::sum); }); planCounter.forEach((planId, count) -> { var context = metric.createContext(Map.of("plan", planId)); metric.set(TENANT_METRIC, count, context); }); } private void reportZmsQuotaMetrics() { var quota = zmsClient.getQuotaUsage(); reportZmsQuota("subdomains", quota.getSubdomainUsage()); reportZmsQuota("services", quota.getServiceUsage()); reportZmsQuota("policies", quota.getPolicyUsage()); reportZmsQuota("roles", quota.getRoleUsage()); reportZmsQuota("groups", quota.getGroupUsage()); } private void reportZmsQuota(String resourceType, double usage) { var context = metric.createContext(Map.of("resourceType", resourceType)); metric.set(ZMS_QUOTA_USAGE, usage, context); } private Map platformChangeDurations(VersionStatus versionStatus) { return changeDurations(versionStatus.versions(), VespaVersion::nodeVersions); } private Map osChangeDurations() { return changeDurations(controller().os().status().versions().values(), Function.identity()); } private Map changeDurations(Collection versions, Function> versionsGetter) { var now = clock.instant(); var durations = new HashMap(); for (var version : versions) { for (var nodeVersion : versionsGetter.apply(version)) { durations.put(nodeVersion, nodeVersion.changeDuration(now)); } } return durations; } private static double deploymentFailRatio(DeploymentStatusList statuses) { return statuses.asList().stream() .mapToInt(status -> status.hasFailures() ? 1 : 0) .average().orElse(0); } private static Map averageDeploymentDurations(DeploymentStatusList statuses, Instant now) { return statuses.asList().stream() .flatMap(status -> status.instanceJobs().entrySet().stream()) .collect(Collectors.toUnmodifiableMap(entry -> entry.getKey(), entry -> averageDeploymentDuration(entry.getValue(), now))); } private static Map deploymentsFailingUpgrade(DeploymentStatusList statuses) { return statuses.asList().stream() .flatMap(status -> status.instanceJobs().entrySet().stream()) .collect(Collectors.toUnmodifiableMap(entry -> entry.getKey(), entry -> deploymentsFailingUpgrade(entry.getValue()))); } private static int deploymentsFailingUpgrade(JobList jobs) { return jobs.failingHard().not().failingApplicationChange().size(); } private static Duration averageDeploymentDuration(JobList jobs, Instant now) { List jobDurations = jobs.lastTriggered() .mapToList(run -> Duration.between(run.start(), run.end().orElse(now))); return jobDurations.stream() .reduce(Duration::plus) .map(totalDuration -> totalDuration.dividedBy(jobDurations.size())) .orElse(Duration.ZERO); } private static Map deploymentWarnings(DeploymentStatusList statuses) { return statuses.asList().stream() .flatMap(status -> status.application().instances().values().stream()) .collect(Collectors.toMap(Instance::id, a -> maxWarningCountOf(a.deployments().values()))); } private static int maxWarningCountOf(Collection deployments) { return deployments.stream() .map(Deployment::metrics) .map(DeploymentMetrics::warnings) .map(Map::values) .flatMap(Collection::stream) .max(Integer::compareTo) .orElse(0); } private static Map dimensions(ApplicationId application) { return Map.of("tenantName", application.tenant().value(), "app", application.application().value() + "." + application.instance().value(), "applicationId", application.toFullString()); } private static Map dimensions(HostName hostname, ZoneId zone) { return Map.of("host", hostname.value(), "zone", zone.value()); } private static Map dimensions(ZoneId zone, Version currentVersion) { return Map.of("zone", zone.value(), "currentVersion", currentVersion.toFullString()); } private static class NodeCountKey { private final String metricName; private final Version version; private final ZoneId zone; public NodeCountKey(String metricName, Version version, ZoneId zone) { this.metricName = metricName; this.version = version; this.zone = zone; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; NodeCountKey that = (NodeCountKey) o; return metricName.equals(that.metricName) && version.equals(that.version) && zone.equals(that.zone); } @Override public int hashCode() { return Objects.hash(metricName, version, zone); } } }