diff options
Diffstat (limited to 'node-repository/src')
6 files changed, 50 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 3ac5b496f56..1601b2e3205 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -51,10 +51,6 @@ public class MetricsReporter extends Maintainer { } private void updateNodeMetrics(Node node) { - // Dimensions automatically added: host, vespaVersion, zone, role, and colo. - // 'vespaVersion' is the vespaVersion for the config server and not related - // to the node we're making metric for now. - Metric.Context context; Optional<Allocation> allocation = node.allocation(); @@ -65,12 +61,13 @@ public class MetricsReporter extends Maintainer { "hostname", node.hostname(), "tenantName", applicationId.tenant().value(), "applicationId", applicationId.serializedForm().replace(':', '.'), + "app", toApp(applicationId), "clustertype", allocation.get().membership().cluster().type().name(), "clusterid", allocation.get().membership().cluster().id().value()); long wantedRestartGeneration = allocation.get().restartGeneration().wanted(); metric.set("wantedRestartGeneration", wantedRestartGeneration, context); - long currentRestartGeneration = allocation.get().restartGeneration().wanted(); + long currentRestartGeneration = allocation.get().restartGeneration().current(); metric.set("currentRestartGeneration", currentRestartGeneration, context); boolean wantToRestart = currentRestartGeneration < wantedRestartGeneration; metric.set("wantToRestart", wantToRestart ? 1 : 0, context); @@ -105,6 +102,12 @@ public class MetricsReporter extends Maintainer { metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context); metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context); + metric.set("hardwareFailure", + node.status().hardwareFailureDescription().isPresent() ? 1 : 0, + context); + metric.set("hardwareDivergence", + node.status().hardwareDivergence().isPresent() ? 1 : 0, + context); try { HostStatus status = orchestrator.getNodeStatus(new HostName(node.hostname())); @@ -117,6 +120,10 @@ public class MetricsReporter extends Maintainer { // TODO: Also add metric on whether some services are down on node? } + private static String toApp(ApplicationId applicationId) { + return applicationId.application().value() + "." + applicationId.instance().value(); + } + /** * A version 6.163.20 will be returned as a number 163.020. The major * version can normally be inferred. As long as the micro version stays diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index d90b558a6eb..1c81d97ddea 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -10,6 +10,7 @@ import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.ApplicationInstance; import com.yahoo.vespa.applicationmodel.ServiceCluster; import com.yahoo.vespa.applicationmodel.ServiceInstance; +import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; @@ -18,7 +19,6 @@ import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; import com.yahoo.vespa.service.monitor.ServiceMonitor; -import com.yahoo.vespa.service.monitor.ServiceMonitorStatus; import java.time.Clock; import java.time.Duration; @@ -187,15 +187,15 @@ public class NodeFailer extends Maintainer { */ private List<Node> determineActiveNodeDownStatus() { List<Node> downNodes = new ArrayList<>(); - for (ApplicationInstance<ServiceMonitorStatus> application : serviceMonitor.queryStatusOfAllApplicationInstances().values()) { - for (ServiceCluster<ServiceMonitorStatus> cluster : application.serviceClusters()) { - for (ServiceInstance<ServiceMonitorStatus> service : cluster.serviceInstances()) { + for (ApplicationInstance application : serviceMonitor.getAllApplicationInstances().values()) { + for (ServiceCluster cluster : application.serviceClusters()) { + for (ServiceInstance service : cluster.serviceInstances()) { Optional<Node> node = nodeRepository().getNode(service.hostName().s(), Node.State.active); if ( ! node.isPresent()) continue; // we also get status from infrastructure nodes, which are not in the repo. TODO: remove when proxy nodes are in node repo everywhere - if (service.serviceStatus().equals(ServiceMonitorStatus.DOWN)) + if (service.serviceStatus().equals(ServiceStatus.DOWN)) downNodes.add(recordAsDown(node.get())); - else if (service.serviceStatus().equals(ServiceMonitorStatus.UP)) + else if (service.serviceStatus().equals(ServiceStatus.UP)) clearDownRecord(node.get()); // else: we don't know current status; don't take any action until we have positive information } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/package-info.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/package-info.java new file mode 100644 index 00000000000..5d0a3cc6093 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/package-info.java @@ -0,0 +1,8 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author bjorncs + */ +@ExportPackage +package com.yahoo.vespa.hosted.provision.node; + +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java index 19882f0a508..46d72974718 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException; import com.yahoo.vespa.orchestrator.BatchHostNameNotFoundException; import com.yahoo.vespa.orchestrator.BatchInternalErrorException; +import com.yahoo.vespa.orchestrator.Host; import com.yahoo.vespa.orchestrator.HostNameNotFoundException; import com.yahoo.vespa.orchestrator.OrchestrationException; import com.yahoo.vespa.orchestrator.Orchestrator; @@ -29,6 +30,11 @@ public class OrchestratorMock implements Orchestrator { Set<ApplicationId> suspendedApplications = new HashSet<>(); @Override + public Host getHost(HostName hostName) throws HostNameNotFoundException { + return null; + } + + @Override public HostStatus getNodeStatus(HostName hostName) throws HostNameNotFoundException { return null; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java index e38802234bf..56e5fcafbde 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java @@ -11,12 +11,12 @@ import com.yahoo.vespa.applicationmodel.ConfigId; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.applicationmodel.ServiceCluster; import com.yahoo.vespa.applicationmodel.ServiceInstance; +import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.service.monitor.ServiceMonitor; -import com.yahoo.vespa.service.monitor.ServiceMonitorStatus; import java.util.Collections; import java.util.HashMap; @@ -60,31 +60,31 @@ public class ServiceMonitorStub implements ServiceMonitor { this.statusIsKnown = statusIsKnown; } - private ServiceMonitorStatus getHostStatus(String hostname) { - if (!statusIsKnown) return ServiceMonitorStatus.NOT_CHECKED; - if (downHosts.contains(hostname)) return ServiceMonitorStatus.DOWN; - return ServiceMonitorStatus.UP; + private ServiceStatus getHostStatus(String hostname) { + if (!statusIsKnown) return ServiceStatus.NOT_CHECKED; + if (downHosts.contains(hostname)) return ServiceStatus.DOWN; + return ServiceStatus.UP; } @Override - public Map<ApplicationInstanceReference, ApplicationInstance<ServiceMonitorStatus>> queryStatusOfAllApplicationInstances() { + public Map<ApplicationInstanceReference, ApplicationInstance> getAllApplicationInstances() { // Convert apps information to the response payload to return - Map<ApplicationInstanceReference, ApplicationInstance<ServiceMonitorStatus>> status = new HashMap<>(); + Map<ApplicationInstanceReference, ApplicationInstance> status = new HashMap<>(); for (Map.Entry<ApplicationId, MockDeployer.ApplicationContext> app : apps.entrySet()) { - Set<ServiceInstance<ServiceMonitorStatus>> serviceInstances = new HashSet<>(); + Set<ServiceInstance> serviceInstances = new HashSet<>(); for (Node node : nodeRepository.getNodes(app.getValue().id(), Node.State.active)) { - serviceInstances.add(new ServiceInstance<>(new ConfigId("configid"), - new HostName(node.hostname()), - getHostStatus(node.hostname()))); + serviceInstances.add(new ServiceInstance(new ConfigId("configid"), + new HostName(node.hostname()), + getHostStatus(node.hostname()))); } - Set<ServiceCluster<ServiceMonitorStatus>> serviceClusters = new HashSet<>(); - serviceClusters.add(new ServiceCluster<>(new ClusterId(app.getValue().clusterContexts().get(0).cluster().id().value()), - new ServiceType("serviceType"), - serviceInstances)); + Set<ServiceCluster> serviceClusters = new HashSet<>(); + serviceClusters.add(new ServiceCluster(new ClusterId(app.getValue().clusterContexts().get(0).cluster().id().value()), + new ServiceType("serviceType"), + serviceInstances)); TenantId tenantId = new TenantId(app.getKey().tenant().value()); ApplicationInstanceId applicationInstanceId = new ApplicationInstanceId(app.getKey().application().value()); status.put(new ApplicationInstanceReference(tenantId, applicationInstanceId), - new ApplicationInstance<>(tenantId, applicationInstanceId, serviceClusters)); + new ApplicationInstance(tenantId, applicationInstanceId, serviceClusters)); } return status; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java index b40e1f4923c..a3697e57482 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java @@ -78,6 +78,8 @@ public class MetricsReporterTest { expectedMetrics.put("wantToReboot", 0); expectedMetrics.put("wantToRetire", 0); expectedMetrics.put("wantToDeprovision", 0); + expectedMetrics.put("hardwareFailure", 0); + expectedMetrics.put("hardwareDivergence", 0); expectedMetrics.put("allowedToBeDown", 0); Orchestrator orchestrator = mock(Orchestrator.class); |