summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2021-01-07 10:49:45 +0100
committer Martin Polden <mpolden@mpolden.no> 2021-01-07 10:55:03 +0100
commit 7471c4fd27a6c5eb764e0eaff78f7aaa2f8d4403 (patch)
tree 716ed034e090a9b4c811f48f0d4445d9b710fd05 /node-repository
parent 20f52d23421cd25f1e2e27a17c5e404c55a86ff1 (diff)
Report exclusive switch metric per cluster
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java 24
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java 30
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java 13
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java 102
4 files changed, 112 insertions, 57 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 5c635551692..b0b61e8a6b2 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -12,8 +12,10 @@ import com.yahoo.config.provision.NodeType;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.List;
+import java.util.Map;
import java.util.Optional;
import java.util.Set;
+import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -213,6 +215,28 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
first().get().resources());
}
+ /** Returns the nodes whose switch is the switch of exactly one of the given cluster hosts, or whose switch is unknown. A node with a parent is judged by its parent's switch, a node without a parent by its own. Throws IllegalArgumentException if a node has a parent hostname but that parent is not found in clusterHosts. */
+ public NodeList onExclusiveSwitch(NodeList clusterHosts) {
+ ensureSingleCluster(); // presumably rejects lists spanning several clusters — only the start of its body is visible in this diff
+ Map<String, Long> switchCount = clusterHosts.stream() // number of given hosts per switch hostname
+ .flatMap(host -> host.switchHostname().stream()) // hosts without a switch hostname contribute nothing
+ .collect(Collectors.groupingBy(Function.identity(),
+ Collectors.counting()));
+ return matching(node -> {
+ Optional<Node> nodeOnSwitch = clusterHosts.parentOf(node);
+ if (node.parentHostname().isPresent()) {
+ if (nodeOnSwitch.isEmpty()) { // node names a parent, but that parent was not passed in
+ throw new IllegalArgumentException("Parent of " + node + ", " + node.parentHostname().get() +
+ ", not found in given cluster hosts");
+ }
+ } else {
+ nodeOnSwitch = Optional.of(node); // no parent: the node itself is the one attached to a switch
+ }
+ Optional<String> allocatedSwitch = nodeOnSwitch.flatMap(Node::switchHostname);
+ return allocatedSwitch.isEmpty() || switchCount.get(allocatedSwitch.get()) == 1; // unknown switch counts as exclusive. NOTE(review): switchCount.get(...) is null when this switch was not seen among clusterHosts (possible for a parentless node that is not itself in clusterHosts?); unboxing null for == 1 would throw NullPointerException — confirm against callers
+ });
+ }
+
private void ensureSingleCluster() {
if (isEmpty()) return;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 4a5c28fe0c8..778a3656dca 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -79,6 +79,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
updateTenantUsageMetrics(nodes);
updateRepairTicketMetrics(nodes);
updateAllocationMetrics(nodes);
+ updateExclusiveSwitchMetrics(nodes);
return true;
}
@@ -102,11 +103,24 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
} else {
nonActiveFraction = (double) nonActiveNodes / (double) activeNodes;
}
- Map<String, String> dimensions = new HashMap<>(dimensions(clusterKey.application));
- dimensions.put("clusterId", clusterKey.cluster.value());
- metric.set("nodes.active", activeNodes, getContext(dimensions));
- metric.set("nodes.nonActive", nonActiveNodes, getContext(dimensions));
- metric.set("nodes.nonActiveFraction", nonActiveFraction, getContext(dimensions));
+ Metric.Context context = getContext(dimensions(clusterKey.application, clusterKey.cluster));
+ metric.set("nodes.active", activeNodes, context);
+ metric.set("nodes.nonActive", nonActiveNodes, context);
+ metric.set("nodes.nonActiveFraction", nonActiveFraction, context);
+ });
+ }
+
+ private void updateExclusiveSwitchMetrics(NodeList nodes) { // Sets "nodes.exclusiveSwitchFraction" per cluster: nodes on an exclusive switch divided by all nodes of that cluster
+ Map<ClusterKey, List<Node>> byCluster = nodes.stream() // only active, allocated tenant nodes are considered
+ .filter(node -> node.type() == NodeType.tenant)
+ .filter(node -> node.state() == State.active)
+ .filter(node -> node.allocation().isPresent()) // makes the allocation().get() calls below safe
+ .collect(Collectors.groupingBy(node -> new ClusterKey(node.allocation().get().owner(), node.allocation().get().membership().cluster().id())));
+ byCluster.forEach((clusterKey, clusterNodes) -> {
+ NodeList clusterHosts = nodes.parentsOf(NodeList.copyOf(clusterNodes)); // presumably the parent hosts of this cluster's nodes — parentsOf is defined outside this diff
+ long nodesOnExclusiveSwitch = NodeList.copyOf(clusterNodes).onExclusiveSwitch(clusterHosts).size();
+ double exclusiveSwitchRatio = nodesOnExclusiveSwitch / (double) clusterNodes.size(); // cast keeps the division in floating point; groupingBy only creates groups with at least one node, so the divisor is never 0
+ metric.set("nodes.exclusiveSwitchFraction", exclusiveSwitchRatio, getContext(dimensions(clusterKey.application, clusterKey.cluster)));
});
}
@@ -340,6 +354,12 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
.forEach((status, number) -> metric.set("hostedVespa.breakfixedHosts", number, getContext(Map.of("status", status))));
}
+ static Map<String, String> dimensions(ApplicationId application, ClusterSpec.Id cluster) { // Dimensions of the given application plus a "clusterId" entry; not private because MetricsReporterTest calls it
+ Map<String, String> dimensions = new HashMap<>(dimensions(application)); // copy into a mutable map: the single-argument overload returns an immutable Map.of(...)
+ dimensions.put("clusterId", cluster.value());
+ return dimensions;
+ }
+
private static Map<String, String> dimensions(ApplicationId application) {
return Map.of("tenantName", application.tenant().value(),
"applicationId", application.serializedForm().replace(':', '.'),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
index ee02beb168f..e545b3d97ee 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
@@ -13,7 +13,7 @@ import com.yahoo.vespa.hosted.provision.node.Agent;
import java.time.Duration;
import java.util.HashSet;
-import java.util.Optional;
+import java.util.List;
import java.util.Set;
/**
@@ -67,17 +67,12 @@ public class SwitchRebalancer extends NodeMover<Move> {
}
/** Returns whether allocatedNode is on an exclusive switch */
- private boolean onExclusiveSwitch(Node allocatedNode, NodeList clusterHosts) {
- Optional<String> allocatedSwitch = clusterHosts.parentOf(allocatedNode).flatMap(Node::switchHostname);
- if (allocatedSwitch.isEmpty()) return true;
- return clusterHosts.stream()
- .flatMap(host -> host.switchHostname().stream())
- .filter(switchHostname -> switchHostname.equals(allocatedSwitch.get()))
- .count() == 1;
+ private static boolean onExclusiveSwitch(Node allocatedNode, NodeList clusterHosts) {
+ return !NodeList.copyOf(List.of(allocatedNode)).onExclusiveSwitch(clusterHosts).isEmpty(); // single-element list: a non-empty result means allocatedNode passed the NodeList.onExclusiveSwitch filter. NOTE(review): the removed code returned true when the parent was not found in clusterHosts; NodeList.onExclusiveSwitch throws IllegalArgumentException in that case when the node has a parent hostname — confirm this is intended
}
/** Returns whether allocating a node on toHost would increase the number of exclusive switches */
- private boolean increasesExclusiveSwitches(NodeList clusterNodes, NodeList clusterHosts, Node toHost) {
+ private static boolean increasesExclusiveSwitches(NodeList clusterNodes, NodeList clusterHosts, Node toHost) {
if (toHost.switchHostname().isEmpty()) return false;
Set<String> activeSwitches = new HashSet<>();
int unknownSwitches = 0;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index 3e4887b6998..0a4ba497558 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -6,20 +6,16 @@ import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Capacity;
import com.yahoo.config.provision.ClusterMembership;
import com.yahoo.config.provision.ClusterResources;
-import com.yahoo.config.provision.DockerImage;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeFlavors;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
-import com.yahoo.config.provision.Zone;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.transaction.NestedTransaction;
import com.yahoo.vespa.applicationmodel.ApplicationInstance;
import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference;
-import com.yahoo.vespa.curator.Curator;
-import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.curator.stats.LockStats;
-import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
@@ -27,10 +23,8 @@ import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.Generation;
import com.yahoo.vespa.hosted.provision.node.IP;
-import com.yahoo.vespa.hosted.provision.provisioning.EmptyProvisionServiceProvider;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
-import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.status.HostInfo;
import com.yahoo.vespa.orchestrator.status.HostStatus;
@@ -39,7 +33,6 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
import org.junit.Before;
import org.junit.Test;
-import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
@@ -85,7 +78,10 @@ public class MetricsReporterTest {
@Test
public void test_registered_metric() {
NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default");
- ProvisioningTester tester = new ProvisioningTester.Builder().flavors(nodeFlavors.getFlavors()).build();
+ Orchestrator orchestrator = mock(Orchestrator.class);
+ when(orchestrator.getHostInfo(eq(reference), any())).thenReturn(
+ HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.ofEpochSecond(1)));
+ ProvisioningTester tester = new ProvisioningTester.Builder().flavors(nodeFlavors.getFlavors()).orchestrator(orchestrator).build();
NodeRepository nodeRepository = tester.nodeRepository();
tester.makeProvisionedNodes(1, "default", NodeType.tenant, 0);
tester.makeProvisionedNodes(1, "default", NodeType.proxy, 0);
@@ -132,17 +128,8 @@ public class MetricsReporterTest {
tester.clock().setInstant(Instant.ofEpochSecond(124));
- Orchestrator orchestrator = mock(Orchestrator.class);
- when(orchestrator.getHostInfo(eq(reference), any())).thenReturn(
- HostInfo.createSuspended(HostStatus.ALLOWED_TO_BE_DOWN, Instant.ofEpochSecond(1)));
-
TestMetric metric = new TestMetric();
- MetricsReporter metricsReporter = new MetricsReporter(nodeRepository,
- metric,
- orchestrator,
- serviceMonitor,
- () -> 42,
- LONG_INTERVAL);
+ MetricsReporter metricsReporter = metricsReporter(metric, tester);
metricsReporter.maintain();
// Verify sum of values across dimensions, and remove these metrics to avoid checking against
@@ -167,17 +154,8 @@ public class MetricsReporterTest {
@Test
public void docker_metrics() {
NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("host", "docker", "docker2");
- Curator curator = new MockCurator();
- NodeRepository nodeRepository = new NodeRepository(nodeFlavors,
- new EmptyProvisionServiceProvider(),
- curator,
- Clock.systemUTC(),
- Zone.defaultZone(),
- new MockNameResolver().mockAnyLookup(),
- DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"),
- new InMemoryFlagSource(),
- true,
- 0, 1000);
+ ProvisioningTester tester = new ProvisioningTester.Builder().flavors(nodeFlavors.getFlavors()).build();
+ NodeRepository nodeRepository = tester.nodeRepository();
// Allow 4 containers
Set<String> ipAddressPool = Set.of("::2", "::3", "::4", "::5");
@@ -210,12 +188,7 @@ public class MetricsReporterTest {
when(orchestrator.getHostInfo(eq(reference), any())).thenReturn(HostInfo.createNoRemarks());
TestMetric metric = new TestMetric();
- MetricsReporter metricsReporter = new MetricsReporter(nodeRepository,
- metric,
- orchestrator,
- serviceMonitor,
- () -> 42,
- LONG_INTERVAL);
+ MetricsReporter metricsReporter = metricsReporter(metric, tester);
metricsReporter.maintain();
assertEquals(0, metric.values.get("hostedVespa.readyHosts")); // Only tenants counts
@@ -246,13 +219,7 @@ public class MetricsReporterTest {
tester.makeReadyHosts(5, new NodeResources(64, 256, 2000, 10));
tester.activateTenantHosts();
TestMetric metric = new TestMetric();
- MetricsReporter metricsReporter = new MetricsReporter(tester.nodeRepository(),
- metric,
- tester.orchestrator(),
- serviceMonitor,
- () -> 42,
- LONG_INTERVAL);
-
+ MetricsReporter metricsReporter = metricsReporter(metric, tester);
// Application is deployed
ApplicationId application = ApplicationId.from("t1", "a1", "default");
@@ -279,6 +246,46 @@ public class MetricsReporterTest {
assertEquals(3, getMetric("nodes.nonActive", metric, dimensions));
}
+ @Test
+ public void exclusive_switch_ratio() { // Checks the per-cluster "nodes.exclusiveSwitchFraction" metric for two clusters of one application
+ ProvisioningTester tester = new ProvisioningTester.Builder().build();
+ ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("c1")).vespaVersion("1").build();
+ Capacity capacity = Capacity.from(new ClusterResources(4, 1, new NodeResources(4, 8, 50, 1))); // presumably 4 nodes in 1 group — confirm against ClusterResources
+ ApplicationId app = ApplicationId.from("t1", "a1", "default");
+ TestMetric metric = new TestMetric();
+ MetricsReporter metricsReporter = metricsReporter(metric, tester);
+
+ // Provision initial hosts on two switches: the first host on switch0, all remaining hosts on switch1
+ NodeResources hostResources = new NodeResources(8, 16, 500, 10);
+ List<Node> hosts0 = tester.makeReadyNodes(4, hostResources, NodeType.host, 5);
+ tester.activateTenantHosts();
+ String switch0 = "switch0";
+ String switch1 = "switch1";
+ tester.patchNode(hosts0.get(0), (host) -> host.withSwitchHostname(switch0));
+ tester.patchNodes(hosts0.subList(1, hosts0.size()), (host) -> host.withSwitchHostname(switch1));
+
+ // Deploy the first cluster (c1) and report metrics
+ tester.deploy(app, spec, capacity);
+ tester.assertSwitches(Set.of(switch0, switch1), app, spec.id());
+ metricsReporter.maintain();
+ assertEquals(0.25D, getMetric("nodes.exclusiveSwitchFraction", metric, MetricsReporter.dimensions(app, spec.id())).doubleValue(), Double.MIN_VALUE); // presumably 1 of 4 nodes: only the node whose host is alone on switch0. A delta of Double.MIN_VALUE makes this effectively an exact comparison
+
+ // More exclusive switches become available: two more hosts, each on a switch of its own
+ List<Node> hosts1 = tester.makeReadyNodes(2, hostResources, NodeType.host, 5);
+ tester.activateTenantHosts();
+ String switch2 = "switch2";
+ String switch3 = "switch3";
+ tester.patchNode(hosts1.get(0), (host) -> host.withSwitchHostname(switch2));
+ tester.patchNode(hosts1.get(1), (host) -> host.withSwitchHostname(switch3));
+
+ // Another cluster (c2) of the same application is added, with the same capacity
+ ClusterSpec spec2 = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("c2")).vespaVersion("1").build();
+ tester.deploy(app, spec2, capacity);
+ tester.assertSwitches(Set.of(switch0, switch1, switch2, switch3), app, spec2.id());
+ metricsReporter.maintain();
+ assertEquals(1D, getMetric("nodes.exclusiveSwitchFraction", metric, MetricsReporter.dimensions(app, spec2.id())).doubleValue(), Double.MIN_VALUE); // looked up with c2's dimensions, so only the newly added cluster is checked here
+ }
+
private Number getMetric(String name, TestMetric metric, Map<String, String> dimensions) {
List<TestMetric.TestContext> metrics = metric.context.get(name).stream()
.filter(ctx -> ctx.properties.entrySet().containsAll(dimensions.entrySet()))
@@ -306,4 +313,13 @@ public class MetricsReporterTest {
return Optional.empty();
}
+ private MetricsReporter metricsReporter(TestMetric metric, ProvisioningTester tester) { // Builds a MetricsReporter from the tester's node repository and orchestrator; replaces the constructor calls previously repeated in each test
+ return new MetricsReporter(tester.nodeRepository(),
+ metric,
+ tester.orchestrator(),
+ serviceMonitor,
+ () -> 42, // NOTE(review): the meaning of this supplier argument is not visible in this diff
+ LONG_INTERVAL);
+ }
+
}