path: root/node-repository
author    Håkon Hallingstad <hakon.hallingstad@gmail.com>  2019-11-14 11:39:53 +0100
committer GitHub <noreply@github.com>                      2019-11-14 11:39:53 +0100
commit    eefeb095cb136450e69dabd25b43250775cb98d7 (patch)
tree      cf407f86fd1ce4b7ee94223666a8c04959bfc02e /node-repository
parent    4f26bb8dacc218bb9d95fdfd4d06f021aa35a021 (diff)
parent    b4dd01400a045c16f2ff0302f735d2c62d4d1f57 (diff)
Merge pull request #11284 from vespa-engine/hakonhall/allow-overriding-noderepositorymaintenance-durations-with-flag
Add flag to control reboot interval
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java               48
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java  36
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Generation.java                         4
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java           105
4 files changed, 123 insertions(+), 70 deletions(-)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
index da64ab6f41d..d81d01bc941 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
@@ -2,6 +2,9 @@
package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.Flavor;
+import com.yahoo.vespa.flags.FlagSource;
+import com.yahoo.vespa.flags.Flags;
+import com.yahoo.vespa.flags.IntFlag;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.History;
@@ -9,8 +12,10 @@ import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter;
import java.time.Clock;
import java.time.Duration;
+import java.util.Comparator;
import java.util.EnumSet;
import java.util.List;
+import java.util.Optional;
import java.util.Random;
import java.util.stream.Collectors;
@@ -22,14 +27,14 @@ import java.util.stream.Collectors;
* @author bratseth
*/
public class NodeRebooter extends Maintainer {
-
- private final Duration rebootInterval;
+
+ private final IntFlag rebootIntervalInDays;
private final Clock clock;
private final Random random;
- NodeRebooter(NodeRepository nodeRepository, Clock clock, Duration rebootInterval) {
- super(nodeRepository, min(Duration.ofMinutes(25), rebootInterval));
- this.rebootInterval = rebootInterval;
+ NodeRebooter(NodeRepository nodeRepository, Clock clock, FlagSource flagSource) {
+ super(nodeRepository, Duration.ofMinutes(25));
+ this.rebootIntervalInDays = Flags.REBOOT_INTERVAL_IN_DAYS.bindTo(flagSource);
this.clock = clock;
this.random = new Random(clock.millis()); // seed with clock for test determinism
}
@@ -37,9 +42,7 @@ public class NodeRebooter extends Maintainer {
@Override
protected void maintain() {
// Reboot candidates: Nodes in long-term states, which we know can safely orchestrate a reboot
- EnumSet<Node.State> targetStates = EnumSet.of(Node.State.active, Node.State.ready);
- List<Node> nodesToReboot = nodeRepository().getNodes().stream()
- .filter(node -> targetStates.contains(node.state()))
+ List<Node> nodesToReboot = nodeRepository().getNodes(Node.State.active, Node.State.ready).stream()
.filter(node -> node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
.filter(this::shouldReboot)
.collect(Collectors.toList());
@@ -49,13 +52,32 @@ public class NodeRebooter extends Maintainer {
}
private boolean shouldReboot(Node node) {
- var rebootEvents = EnumSet.of(History.Event.Type.rebooted, History.Event.Type.osUpgraded);
- var acceptableRebootInstant = clock.instant().minus(rebootInterval);
+ if (node.status().reboot().pending()) return false;
+
+ var rebootEvents = EnumSet.of(History.Event.Type.provisioned, History.Event.Type.rebooted, History.Event.Type.osUpgraded);
+ var rebootInterval = Duration.ofDays(rebootIntervalInDays.value());
+
+ Optional<Duration> overdue = node.history().events().stream()
+ .filter(event -> rebootEvents.contains(event.type()))
+ .map(History.Event::at)
+ .max(Comparator.naturalOrder())
+ .map(lastReboot -> Duration.between(lastReboot, clock.instant()).minus(rebootInterval));
- if (rebootEvents.stream().anyMatch(event -> node.history().hasEventAfter(event, acceptableRebootInstant)))
+ if (overdue.isEmpty()) // should never happen, as all non-Docker-container nodes should have a provisioned timestamp
+ return random.nextDouble() < interval().getSeconds() / (double) rebootInterval.getSeconds();
+
+ if (overdue.get().isNegative())
return false;
- else // schedule with a probability such that reboots of nodes are spread roughly over the reboot interval
- return random.nextDouble() < (double) interval().getSeconds() / (double)rebootInterval.getSeconds();
+
+ // Use a probability such that each maintain() schedules the same number of reboots,
+ // as long as 0 <= overdue <= rebootInterval, with the last maintain() in that interval
+ // naturally scheduling the remaining with probability 1.
+
+ int configServers = 3;
+ long secondsRemaining = Math.max(0, rebootInterval.getSeconds() - overdue.get().getSeconds());
+ double runsRemaining = configServers * secondsRemaining / (double) interval().getSeconds();
+ double probability = 1 / (1 + runsRemaining);
+ return random.nextDouble() < probability;
}
}
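
The probability used in the new shouldReboot() has a reservoir-sampling flavor: with R maintainer runs remaining before the node is a full interval overdue, scheduling with probability 1/(1+R) gives a survival chain R/(1+R) · (R-1)/R · … that telescopes to 1/(R0+1) for every run, so each remaining run schedules the same expected number of reboots, and the final run (R = 0) fires with probability 1. Below is a standalone sketch, not part of the patch (the class name and constants are illustrative, mirroring the 30-day interval, 25-minute run interval, and 3 config servers above), that checks the reboots do spread roughly evenly:

```java
import java.util.Random;

// Simulates the scheduling probability NodeRebooter uses, to check that
// reboots spread roughly evenly over the second reboot interval. Each
// 25-minute period gets one draw per config server, all at the same
// probability, matching the patch's formula.
public class RebootSpreadSimulation {

    public static void main(String[] args) {
        long intervalSeconds = 30L * 24 * 3600; // reboot interval (30 days)
        long runSeconds = 25 * 60;              // maintainer run interval
        int configServers = 3;
        int nodes = 10_000;
        Random random = new Random(42);

        int[] histogram = new int[10]; // reboots per tenth of the interval
        for (int n = 0; n < nodes; n++) {
            schedule:
            for (long overdue = 0; overdue <= intervalSeconds; overdue += runSeconds) {
                long secondsRemaining = Math.max(0, intervalSeconds - overdue);
                double runsRemaining = configServers * secondsRemaining / (double) runSeconds;
                double probability = 1 / (1 + runsRemaining); // same formula as the patch
                for (int server = 0; server < configServers; server++) {
                    if (random.nextDouble() < probability) {
                        histogram[(int) Math.min(9, 10 * overdue / intervalSeconds)]++;
                        break schedule; // this node's reboot is scheduled
                    }
                }
            }
        }
        for (int i = 0; i < histogram.length; i++)
            System.out.printf("decile %d: %d reboots%n", i, histogram[i]);
        // Expect roughly nodes/10 reboots in each decile.
    }
}
```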
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 7c5e529fb03..94e3318ac9c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -66,23 +66,23 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
ProvisionServiceProvider provisionServiceProvider, FlagSource flagSource) {
DefaultTimes defaults = new DefaultTimes(zone);
- nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv().orElse(defaults.throttlePolicy), metric);
- periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, defaults.redeployMaintainerInterval, durationFromEnv("periodic_redeploy_interval").orElse(defaults.periodicRedeployInterval));
- operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval));
- reservationExpirer = new ReservationExpirer(nodeRepository, clock, durationFromEnv("reservation_expiry").orElse(defaults.reservationExpiry));
- retiredExpirer = new RetiredExpirer(nodeRepository, orchestrator, deployer, clock, durationFromEnv("retired_interval").orElse(defaults.retiredInterval), durationFromEnv("retired_expiry").orElse(defaults.retiredExpiry));
- inactiveExpirer = new InactiveExpirer(nodeRepository, clock, durationFromEnv("inactive_expiry").orElse(defaults.inactiveExpiry));
- failedExpirer = new FailedExpirer(nodeRepository, zone, clock, durationFromEnv("failed_expirer_interval").orElse(defaults.failedExpirerInterval));
- dirtyExpirer = new DirtyExpirer(nodeRepository, clock, durationFromEnv("dirty_expiry").orElse(defaults.dirtyExpiry));
- provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, durationFromEnv("provisioned_expiry").orElse(defaults.provisionedExpiry));
- nodeRebooter = new NodeRebooter(nodeRepository, clock, durationFromEnv("reboot_interval").orElse(defaults.rebootInterval));
- metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, durationFromEnv("metrics_interval").orElse(defaults.metricsInterval));
- infrastructureProvisioner = new InfrastructureProvisioner(nodeRepository, infraDeployer, durationFromEnv("infrastructure_provision_interval").orElse(defaults.infrastructureProvisionInterval));
+ nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, defaults.failGrace, clock, orchestrator, throttlePolicyFromEnv().orElse(defaults.throttlePolicy), metric);
+ periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, defaults.redeployMaintainerInterval, defaults.periodicRedeployInterval);
+ operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, defaults.operatorChangeRedeployInterval);
+ reservationExpirer = new ReservationExpirer(nodeRepository, clock, defaults.reservationExpiry);
+ retiredExpirer = new RetiredExpirer(nodeRepository, orchestrator, deployer, clock, defaults.retiredInterval, defaults.retiredExpiry);
+ inactiveExpirer = new InactiveExpirer(nodeRepository, clock, defaults.inactiveExpiry);
+ failedExpirer = new FailedExpirer(nodeRepository, zone, clock, defaults.failedExpirerInterval);
+ dirtyExpirer = new DirtyExpirer(nodeRepository, clock, defaults.dirtyExpiry);
+ provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, defaults.provisionedExpiry);
+ nodeRebooter = new NodeRebooter(nodeRepository, clock, flagSource);
+ metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval);
+ infrastructureProvisioner = new InfrastructureProvisioner(nodeRepository, infraDeployer, defaults.infrastructureProvisionInterval);
loadBalancerExpirer = provisionServiceProvider.getLoadBalancerService().map(lbService ->
- new LoadBalancerExpirer(nodeRepository, durationFromEnv("load_balancer_expirer_interval").orElse(defaults.loadBalancerExpirerInterval), lbService));
+ new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService));
dynamicProvisioningMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner ->
- new DynamicProvisioningMaintainer(nodeRepository, durationFromEnv("host_provisioner_interval").orElse(defaults.dynamicProvisionerInterval), hostProvisioner, flagSource));
- capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, durationFromEnv("capacity_report_interval").orElse(defaults.capacityReportInterval));
+ new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource));
+ capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, defaults.capacityReportInterval);
osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval);
rebalancer = new Rebalancer(nodeRepository, provisionServiceProvider.getHostResourcesCalculator(), provisionServiceProvider.getHostProvisioner(), metric, clock, defaults.rebalancerInterval);
@@ -111,10 +111,6 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
rebalancer.deconstruct();
}
- private static Optional<Duration> durationFromEnv(String envVariable) {
- return Optional.ofNullable(System.getenv(envPrefix + envVariable)).map(Long::parseLong).map(Duration::ofSeconds);
- }
-
private static Optional<NodeFailer.ThrottlePolicy> throttlePolicyFromEnv() {
String policyName = System.getenv(envPrefix + "throttle_policy");
try {
@@ -145,7 +141,6 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final Duration failedExpirerInterval;
private final Duration dirtyExpiry;
private final Duration provisionedExpiry;
- private final Duration rebootInterval;
private final Duration capacityReportInterval;
private final Duration metricsInterval;
private final Duration retiredInterval;
@@ -165,7 +160,6 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
operatorChangeRedeployInterval = Duration.ofMinutes(1);
failedExpirerInterval = Duration.ofMinutes(10);
provisionedExpiry = Duration.ofHours(4);
- rebootInterval = Duration.ofDays(30);
capacityReportInterval = Duration.ofMinutes(10);
metricsInterval = Duration.ofMinutes(1);
infrastructureProvisionInterval = Duration.ofMinutes(1);
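
With the env-variable overrides removed, the reboot interval is the one maintenance duration that remains tunable, now through a feature flag. The flag is bound once in the constructor, but value() is read on every maintain(), so a changed flag takes effect without restarting the config server. A minimal sketch of the override, using the same InMemoryFlagSource the tests below use (the 14-day value is illustrative):

```java
import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.flags.IntFlag;

// Illustrative only: overrides the reboot interval through the flag source,
// the way NodeRebooterTest below does, instead of the removed env variables.
public class RebootIntervalFlagSketch {
    public static void main(String[] args) {
        var flagSource = new InMemoryFlagSource()
                .withIntFlag(Flags.REBOOT_INTERVAL_IN_DAYS.id(), 14); // override to 14 days
        IntFlag rebootIntervalInDays = Flags.REBOOT_INTERVAL_IN_DAYS.bindTo(flagSource);
        System.out.println(rebootIntervalInDays.value()); // prints 14
    }
}
```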
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Generation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Generation.java
index 77d0edd9d2e..8572a2f3f4d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Generation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Generation.java
@@ -29,6 +29,10 @@ public class Generation {
return current;
}
+ public boolean pending() {
+ return current < wanted;
+ }
+
public Generation withIncreasedWanted() {
return new Generation(wanted + 1, current);
}
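
The new pending() helper makes the bookkeeping explicit: a reboot (or restart) is pending while the current generation lags the wanted one. Illustrative usage, assuming the (wanted, current) constructor order implied by withIncreasedWanted() above:

```java
import com.yahoo.vespa.hosted.provision.node.Generation;

// Illustrative only; constructor argument order (wanted, current) is
// inferred from withIncreasedWanted() in the diff above.
public class GenerationPendingSketch {
    public static void main(String[] args) {
        Generation g = new Generation(3, 3);
        System.out.println(g.pending());                       // false: current == wanted
        System.out.println(g.withIncreasedWanted().pending()); // true: wanted 4 > current 3
    }
}
```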
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
index a97d0aeb9cf..bc97491f828 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
@@ -3,6 +3,8 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.component.Version;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.vespa.flags.Flags;
+import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.hosted.provision.Node;
import org.junit.Test;
@@ -19,51 +21,82 @@ public class NodeRebooterTest {
@Test
public void testRebootScheduling() {
- Duration rebootInterval = Duration.ofMinutes(250);
- MaintenanceTester tester = new MaintenanceTester();
- tester.createReadyTenantNodes(15);
+ var rebootInterval = Duration.ofDays(30);
+ var flagSource = new InMemoryFlagSource().withIntFlag(Flags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays());
+ var tester = new MaintenanceTester();
tester.createReadyHostNodes(15);
- // New non-host nodes are rebooted when transitioning from dirty to ready. Advance the time so that additional
- // reboots will be performed.
+ NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, flagSource);
+
+ assertReadyHosts(15, tester, 0L);
+
+ // No reboots within 0x-1x reboot interval
tester.clock.advance(rebootInterval);
-
- NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, rebootInterval);
-
- maintenanceIntervals(rebooter, tester, 1);
- assertEquals("All tenant nodes have reboot scheduled",
- 15,
- withCurrentRebootGeneration(2L, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready)).size());
- assertEquals("No nodes have 2 reboots scheduled",
- 0,
- withCurrentRebootGeneration(3L, tester.nodeRepository.getNodes(Node.State.ready)).size());
-
- maintenanceIntervals(rebooter, tester, 11);
- assertEquals("Reboot interval is 10x iteration interval, so tenant nodes are now rebooted 3 times",
- 15,
- withCurrentRebootGeneration(3L, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready)).size());
- assertEquals("Reboot interval is 10x iteration interval, so host nodes are now rebooted twice",
- 15,
- withCurrentRebootGeneration(2L, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready)).size());
+ rebooter.maintain();
+ simulateReboot(tester);
+ assertReadyHosts(15, tester, 0L);
+ // All nodes/hosts reboot within the 1x-2x reboot interval
+ tester.clock.advance(rebootInterval);
+ rebooter.maintain();
+ simulateReboot(tester);
+ assertReadyHosts(15, tester, 1L);
+
+ // OS upgrade just before reboots would have been scheduled again
+ tester.clock.advance(rebootInterval);
scheduleOsUpgrade(tester);
- maintenanceIntervals(rebooter, tester, 8);
- assertEquals(15, withCurrentRebootGeneration(2L, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready)).size());
simulateOsUpgrade(tester);
- maintenanceIntervals(rebooter, tester, 1);
- assertEquals("Host nodes are not rebooted as they recently rebooted due to OS upgrade",
- 15, withCurrentRebootGeneration(2L, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready)).size());
+ rebooter.maintain();
+ simulateReboot(tester);
+ assertReadyHosts(15, tester, 1L);
+
+ // OS upgrade counts as a reboot, so within 0x-1x there are no reboots
+ tester.clock.advance(rebootInterval);
+ rebooter.maintain();
+ simulateReboot(tester);
+ assertReadyHosts(15, tester, 1L);
+
+ // OS upgrade counts as a reboot, but within 1x-2x reboots are scheduled again
+ tester.clock.advance(rebootInterval);
+ rebooter.maintain();
+ simulateReboot(tester);
+ assertReadyHosts(15, tester, 2L);
}
-
- private void maintenanceIntervals(NodeRebooter rebooter, MaintenanceTester tester, int iterations) {
- for (int i = 0; i < iterations; i++) {
- tester.clock.advance(Duration.ofMinutes(25));
- for (int j = 0; j < 60; j++) { // multiple runs to remove effects from the probabilistic smoothing in the reboot maintainer
- rebooter.maintain();
- simulateReboot(tester);
+
+ @Test
+ public void testRebootScheduledEvenWithSmallProbability() {
+ Duration rebootInterval = Duration.ofDays(30);
+ var flagSource = new InMemoryFlagSource().withIntFlag(Flags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays());
+ var tester = new MaintenanceTester();
+ tester.createReadyHostNodes(2);
+ NodeRebooter rebooter = new NodeRebooter(tester.nodeRepository, tester.clock, flagSource);
+
+ assertReadyHosts(2, tester, 0L);
+
+ // No reboots within 0x-1x reboot interval
+ tester.clock.advance(rebootInterval);
+ rebooter.maintain();
+ simulateReboot(tester);
+ assertReadyHosts(2, tester, 0L);
+
+ // Advancing just a little bit into the 1x-2x interval, there is a >0 probability of
+ // rebooting a host. Run until all have been scheduled.
+ tester.clock.advance(Duration.ofMinutes(25));
+ for (int i = 0;; ++i) {
+ rebooter.maintain();
+ simulateReboot(tester);
+ List<Node> nodes = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready);
+ int count = withCurrentRebootGeneration(1L, nodes).size();
+ if (count == 2) {
+ break;
}
}
}
-
+
+ private void assertReadyHosts(int expectedCount, MaintenanceTester tester, long generation) {
+ List<Node> nodes = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready);
+ assertEquals(expectedCount, withCurrentRebootGeneration(generation, nodes).size());
+ }
+
/** Set current reboot generation to the wanted reboot generation whenever it is larger (i.e. record a reboot) */
private void simulateReboot(MaintenanceTester tester) {
for (Node node : tester.nodeRepository.getNodes(Node.State.ready, Node.State.active)) {