diff options
author | Martin Polden <mpolden@mpolden.no> | 2021-01-07 10:49:45 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2021-01-07 10:55:03 +0100 |
commit | 7471c4fd27a6c5eb764e0eaff78f7aaa2f8d4403 (patch) | |
tree | 716ed034e090a9b4c811f48f0d4445d9b710fd05 /node-repository | |
parent | 20f52d23421cd25f1e2e27a17c5e404c55a86ff1 (diff) |
Report exclusive switch metric per cluster
Diffstat (limited to 'node-repository')
4 files changed, 112 insertions, 57 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index 5c635551692..b0b61e8a6b2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -12,8 +12,10 @@ import com.yahoo.config.provision.NodeType; import java.util.Comparator; import java.util.EnumSet; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -213,6 +215,28 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { first().get().resources()); } + /** Returns the nodes that are allocated on an exclusive network switch within its cluster */ + public NodeList onExclusiveSwitch(NodeList clusterHosts) { + ensureSingleCluster(); + Map<String, Long> switchCount = clusterHosts.stream() + .flatMap(host -> host.switchHostname().stream()) + .collect(Collectors.groupingBy(Function.identity(), + Collectors.counting())); + return matching(node -> { + Optional<Node> nodeOnSwitch = clusterHosts.parentOf(node); + if (node.parentHostname().isPresent()) { + if (nodeOnSwitch.isEmpty()) { + throw new IllegalArgumentException("Parent of " + node + ", " + node.parentHostname().get() + + ", not found in given cluster hosts"); + } + } else { + nodeOnSwitch = Optional.of(node); + } + Optional<String> allocatedSwitch = nodeOnSwitch.flatMap(Node::switchHostname); + return allocatedSwitch.isEmpty() || switchCount.get(allocatedSwitch.get()) == 1; + }); + } + private void ensureSingleCluster() { if (isEmpty()) return; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 4a5c28fe0c8..778a3656dca 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -79,6 +79,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { updateTenantUsageMetrics(nodes); updateRepairTicketMetrics(nodes); updateAllocationMetrics(nodes); + updateExclusiveSwitchMetrics(nodes); return true; } @@ -102,11 +103,24 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } else { nonActiveFraction = (double) nonActiveNodes / (double) activeNodes; } - Map<String, String> dimensions = new HashMap<>(dimensions(clusterKey.application)); - dimensions.put("clusterId", clusterKey.cluster.value()); - metric.set("nodes.active", activeNodes, getContext(dimensions)); - metric.set("nodes.nonActive", nonActiveNodes, getContext(dimensions)); - metric.set("nodes.nonActiveFraction", nonActiveFraction, getContext(dimensions)); + Metric.Context context = getContext(dimensions(clusterKey.application, clusterKey.cluster)); + metric.set("nodes.active", activeNodes, context); + metric.set("nodes.nonActive", nonActiveNodes, context); + metric.set("nodes.nonActiveFraction", nonActiveFraction, context); + }); + } + + private void updateExclusiveSwitchMetrics(NodeList nodes) { + Map<ClusterKey, List<Node>> byCluster = nodes.stream() + .filter(node -> node.type() == NodeType.tenant) + .filter(node -> node.state() == State.active) + .filter(node -> node.allocation().isPresent()) + .collect(Collectors.groupingBy(node -> new ClusterKey(node.allocation().get().owner(), node.allocation().get().membership().cluster().id()))); + byCluster.forEach((clusterKey, clusterNodes) -> { + NodeList clusterHosts = nodes.parentsOf(NodeList.copyOf(clusterNodes)); + long nodesOnExclusiveSwitch = NodeList.copyOf(clusterNodes).onExclusiveSwitch(clusterHosts).size(); + double exclusiveSwitchRatio = nodesOnExclusiveSwitch / (double) clusterNodes.size(); + metric.set("nodes.exclusiveSwitchFraction", exclusiveSwitchRatio, getContext(dimensions(clusterKey.application, clusterKey.cluster))); }); } @@ -340,6 +354,12 @@ public class MetricsReporter extends NodeRepositoryMaintainer { .forEach((status, number) -> metric.set("hostedVespa.breakfixedHosts", number, getContext(Map.of("status", status)))); } + static Map<String, String> dimensions(ApplicationId application, ClusterSpec.Id cluster) { + Map<String, String> dimensions = new HashMap<>(dimensions(application)); + dimensions.put("clusterId", cluster.value()); + return dimensions; + } + private static Map<String, String> dimensions(ApplicationId application) { return Map.of("tenantName", application.tenant().value(), "applicationId", application.serializedForm().replace(':', '.'), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index ee02beb168f..e545b3d97ee 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -13,7 +13,7 @@ import com.yahoo.vespa.hosted.provision.node.Agent; import java.time.Duration; import java.util.HashSet; -import java.util.Optional; +import java.util.List; import java.util.Set; /** @@ -67,17 +67,12 @@ public class SwitchRebalancer extends NodeMover<Move> { } /** Returns whether allocatedNode is on an exclusive switch */ - private boolean onExclusiveSwitch(Node allocatedNode, NodeList clusterHosts) { - Optional<String> allocatedSwitch = clusterHosts.parentOf(allocatedNode).flatMap(Node::switchHostname); - if (allocatedSwitch.isEmpty()) return true; - return clusterHosts.stream() - .flatMap(host -> host.switchHostname().stream()) - .filter(switchHostname -> switchHostname.equals(allocatedSwitch.get())) - .count() == 1; + private static boolean onExclusiveSwitch(Node allocatedNode, NodeList clusterHosts) { + return !NodeList.copyOf(List.of(allocatedNode)).onExclusiveSwitch(clusterHosts).isEmpty(); } /** Returns whether allocating a node on toHost would increase the number of exclusive switches */ - private boolean increasesExclusiveSwitches(NodeList clusterNodes, NodeList clusterHosts, Node toHost) { + private static boolean increasesExclusiveSwitches(NodeList clusterNodes, NodeList clusterHosts, Node toHost) { if (toHost.switchHostname().isEmpty()) return false; Set<String> activeSwitches = new HashSet<>(); int unknownSwitches = 0; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index 3e4887b6998..0a4ba497558 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -6,20 +6,16 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Capacity; import com.yahoo.config.provision.ClusterMembership; import com.yahoo.config.provision.ClusterResources; -import com.yahoo.config.provision.DockerImage; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; -import com.yahoo.config.provision.Zone; import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; import com.yahoo.transaction.NestedTransaction; import com.yahoo.vespa.applicationmodel.ApplicationInstance; import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; -import com.yahoo.vespa.curator.Curator; -import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.curator.stats.LockStats; -import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -27,10 +23,8 @@ import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.Generation; import com.yahoo.vespa.hosted.provision.node.IP; -import com.yahoo.vespa.hosted.provision.provisioning.EmptyProvisionServiceProvider; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; -import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.status.HostInfo; import com.yahoo.vespa.orchestrator.status.HostStatus; @@ -39,7 +33,6 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor; import org.junit.Before; import org.junit.Test; -import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.List; @@ -85,7 +78,10 @@ public class MetricsReporterTest { @Test public void test_registered_metric() { NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default"); - ProvisioningTester tester = new ProvisioningTester.Builder().flavors(nodeFlavors.getFlavors()).build(); + Orchestrator orchestrator = mock(Orchestrator.class); + when(orchestrator.getHostInfo(eq(reference), any())).thenReturn( + HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.ofEpochSecond(1))); + ProvisioningTester tester = new ProvisioningTester.Builder().flavors(nodeFlavors.getFlavors()).orchestrator(orchestrator).build(); NodeRepository nodeRepository = tester.nodeRepository(); tester.makeProvisionedNodes(1, "default", NodeType.tenant, 0); tester.makeProvisionedNodes(1, "default", NodeType.proxy, 0); @@ -132,17 +128,8 @@ public class MetricsReporterTest { tester.clock().setInstant(Instant.ofEpochSecond(124)); - Orchestrator orchestrator = mock(Orchestrator.class); - when(orchestrator.getHostInfo(eq(reference), any())).thenReturn( - HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.ofEpochSecond(1))); - TestMetric metric = new TestMetric(); - MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, - metric, - orchestrator, - serviceMonitor, - () -> 42, - LONG_INTERVAL); + MetricsReporter metricsReporter = metricsReporter(metric, tester); metricsReporter.maintain(); // Verify sum of values across dimensions, and remove these metrics to avoid checking against @@ -167,17 +154,8 @@ public class MetricsReporterTest { @Test public void docker_metrics() { NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("host", "docker", "docker2"); - Curator curator = new MockCurator(); - NodeRepository nodeRepository = new NodeRepository(nodeFlavors, - new EmptyProvisionServiceProvider(), - curator, - Clock.systemUTC(), - Zone.defaultZone(), - new MockNameResolver().mockAnyLookup(), - DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), - new InMemoryFlagSource(), - true, - 0, 1000); + ProvisioningTester tester = new ProvisioningTester.Builder().flavors(nodeFlavors.getFlavors()).build(); + NodeRepository nodeRepository = tester.nodeRepository(); // Allow 4 containers Set<String> ipAddressPool = Set.of("::2", "::3", "::4", "::5"); @@ -210,12 +188,7 @@ public class MetricsReporterTest { when(orchestrator.getHostInfo(eq(reference), any())).thenReturn(HostInfo.createNoRemarks()); TestMetric metric = new TestMetric(); - MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, - metric, - orchestrator, - serviceMonitor, - () -> 42, - LONG_INTERVAL); + MetricsReporter metricsReporter = metricsReporter(metric, tester); metricsReporter.maintain(); assertEquals(0, metric.values.get("hostedVespa.readyHosts")); // Only tenants counts @@ -246,13 +219,7 @@ public class MetricsReporterTest { tester.makeReadyHosts(5, new NodeResources(64, 256, 2000, 10)); tester.activateTenantHosts(); TestMetric metric = new TestMetric(); - MetricsReporter metricsReporter = new MetricsReporter(tester.nodeRepository(), - metric, - tester.orchestrator(), - serviceMonitor, - () -> 42, - LONG_INTERVAL); - + MetricsReporter metricsReporter = metricsReporter(metric, tester); // Application is deployed ApplicationId application = ApplicationId.from("t1", "a1", "default"); @@ -279,6 +246,46 @@ public class MetricsReporterTest { assertEquals(3, getMetric("nodes.nonActive", metric, dimensions)); } + @Test + public void exclusive_switch_ratio() { + ProvisioningTester tester = new ProvisioningTester.Builder().build(); + ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("c1")).vespaVersion("1").build(); + Capacity capacity = Capacity.from(new ClusterResources(4, 1, new NodeResources(4, 8, 50, 1))); + ApplicationId app = ApplicationId.from("t1", "a1", "default"); + TestMetric metric = new TestMetric(); + MetricsReporter metricsReporter = metricsReporter(metric, tester); + + // Provision initial hosts on two switches + NodeResources hostResources = new NodeResources(8, 16, 500, 10); + List<Node> hosts0 = tester.makeReadyNodes(4, hostResources, NodeType.host, 5); + tester.activateTenantHosts(); + String switch0 = "switch0"; + String switch1 = "switch1"; + tester.patchNode(hosts0.get(0), (host) -> host.withSwitchHostname(switch0)); + tester.patchNodes(hosts0.subList(1, hosts0.size()), (host) -> host.withSwitchHostname(switch1)); + + // Deploy application + tester.deploy(app, spec, capacity); + tester.assertSwitches(Set.of(switch0, switch1), app, spec.id()); + metricsReporter.maintain(); + assertEquals(0.25D, getMetric("nodes.exclusiveSwitchFraction", metric, MetricsReporter.dimensions(app, spec.id())).doubleValue(), Double.MIN_VALUE); + + // More exclusive switches become available + List<Node> hosts1 = tester.makeReadyNodes(2, hostResources, NodeType.host, 5); + tester.activateTenantHosts(); + String switch2 = "switch2"; + String switch3 = "switch3"; + tester.patchNode(hosts1.get(0), (host) -> host.withSwitchHostname(switch2)); + tester.patchNode(hosts1.get(1), (host) -> host.withSwitchHostname(switch3)); + + // Another cluster is added + ClusterSpec spec2 = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("c2")).vespaVersion("1").build(); + tester.deploy(app, spec2, capacity); + tester.assertSwitches(Set.of(switch0, switch1, switch2, switch3), app, spec2.id()); + metricsReporter.maintain(); + assertEquals(1D, getMetric("nodes.exclusiveSwitchFraction", metric, MetricsReporter.dimensions(app, spec2.id())).doubleValue(), Double.MIN_VALUE); + } + private Number getMetric(String name, TestMetric metric, Map<String, String> dimensions) { List<TestMetric.TestContext> metrics = metric.context.get(name).stream() .filter(ctx -> ctx.properties.entrySet().containsAll(dimensions.entrySet())) @@ -306,4 +313,13 @@ public class MetricsReporterTest { return Optional.empty(); } + private MetricsReporter metricsReporter(TestMetric metric, ProvisioningTester tester) { + return new MetricsReporter(tester.nodeRepository(), + metric, + tester.orchestrator(), + serviceMonitor, + () -> 42, + LONG_INTERVAL); + } + } |