diff options
6 files changed, 99 insertions, 16 deletions
diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/NodeType.java b/config-provisioning/src/main/java/com/yahoo/config/provision/NodeType.java index 8553b07d683..875fa83c0bb 100644 --- a/config-provisioning/src/main/java/com/yahoo/config/provision/NodeType.java +++ b/config-provisioning/src/main/java/com/yahoo/config/provision/NodeType.java @@ -9,18 +9,27 @@ package com.yahoo.config.provision; public enum NodeType { /** A host of a set of (docker) tenant nodes */ - host, + host(true), /** Nodes running the shared proxy layer */ - proxy, + proxy(false), /** A node to be assigned to a tenant to run application workloads */ - tenant, + tenant(false), /** A config server */ - config, + config(false), /** A host of a (docker) config server node */ - confighost + confighost(true); + private boolean isDockerHost; + + NodeType(boolean isDockerHost) { + this.isDockerHost = isDockerHost; + } + + public boolean isDockerHost() { + return isDockerHost; + } } diff --git a/configdefinitions/src/vespa/configserver.def b/configdefinitions/src/vespa/configserver.def index f730758d58f..de5132b8957 100644 --- a/configdefinitions/src/vespa/configserver.def +++ b/configdefinitions/src/vespa/configserver.def @@ -48,3 +48,6 @@ loadBalancerAddress string default="" # File distribution disableFiledistributor bool default=true + +# Node admin +nodeAdminInContainer bool default=true diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index a21bd3ff1a1..a2a7aa2545a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.cloud.config.ConfigserverConfig; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostLivenessTracker; @@ -57,12 +58,14 @@ public class NodeFailer extends Maintainer { private final Instant constructionTime; private final ThrottlePolicy throttlePolicy; private final Metric metric; + private final ConfigserverConfig configserverConfig; public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, NodeRepository nodeRepository, Duration downTimeLimit, Clock clock, Orchestrator orchestrator, ThrottlePolicy throttlePolicy, Metric metric, - JobControl jobControl) { + JobControl jobControl, + ConfigserverConfig configserverConfig) { // check ping status every five minutes, but at least twice as often as the down time limit super(nodeRepository, min(downTimeLimit.dividedBy(2), Duration.ofMinutes(5)), jobControl); this.deployer = deployer; @@ -74,6 +77,7 @@ public class NodeFailer extends Maintainer { this.constructionTime = clock.instant(); this.throttlePolicy = throttlePolicy; this.metric = metric; + this.configserverConfig = configserverConfig; } @Override @@ -126,7 +130,7 @@ public class NodeFailer extends Maintainer { Map<Node, String> nodesByFailureReason = new HashMap<>(); for (Node node : nodeRepository().getNodes(Node.State.ready)) { - if (! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) { + if (expectConfigRequests(node) && ! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) { nodesByFailureReason.put(node, "Not receiving config requests from node"); } else if (node.status().hardwareFailureDescription().isPresent()) { nodesByFailureReason.put(node, "Node has hardware failure"); @@ -137,6 +141,10 @@ public class NodeFailer extends Maintainer { return nodesByFailureReason; } + private boolean expectConfigRequests(Node node) { + return !node.type().isDockerHost() || configserverConfig.nodeAdminInContainer(); + } + private boolean hasNodeRequestedConfigAfter(Node node, Instant instant) { return !wasMadeReadyBefore(node, instant) || hasRecordedRequestAfter(node, instant); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index be792630445..7b0606b809b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.google.inject.Inject; +import com.yahoo.cloud.config.ConfigserverConfig; import com.yahoo.component.AbstractComponent; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Environment; @@ -53,17 +54,20 @@ public class NodeRepositoryMaintenance extends AbstractComponent { @Inject public NodeRepositoryMaintenance(NodeRepository nodeRepository, Deployer deployer, Curator curator, HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, - Zone zone, Orchestrator orchestrator, Metric metric) { - this(nodeRepository, deployer, curator, hostLivenessTracker, serviceMonitor, zone, Clock.systemUTC(), orchestrator, metric); + Zone zone, Orchestrator orchestrator, Metric metric, + ConfigserverConfig configserverConfig) { + this(nodeRepository, deployer, curator, hostLivenessTracker, serviceMonitor, zone, Clock.systemUTC(), + orchestrator, metric, configserverConfig); } public NodeRepositoryMaintenance(NodeRepository nodeRepository, Deployer deployer, Curator curator, - HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, - Zone zone, Clock clock, Orchestrator orchestrator, Metric metric) { + HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, + Zone zone, Clock clock, Orchestrator orchestrator, Metric metric, + ConfigserverConfig configserverConfig) { DefaultTimes defaults = new DefaultTimes(zone.environment()); jobControl = new JobControl(nodeRepository.database()); - nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv("throttle_policy").orElse(defaults.throttlePolicy), metric, jobControl); + nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv("throttle_policy").orElse(defaults.throttlePolicy), metric, jobControl, configserverConfig); periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, durationFromEnv("periodic_redeploy_interval").orElse(defaults.periodicRedeployInterval), jobControl); operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, clock, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval), jobControl); zooKeeperAccessMaintainer = new ZooKeeperAccessMaintainer(nodeRepository, curator, durationFromEnv("zookeeper_access_maintenance_interval").orElse(defaults.zooKeeperAccessMaintenanceInterval), jobControl); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java index 5534c28cc1a..a03b06fda13 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.cloud.config.ConfigserverConfig; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationName; @@ -70,19 +71,29 @@ public class NodeFailTester { private final Orchestrator orchestrator; private final NodeRepositoryProvisioner provisioner; private final Curator curator; + private final ConfigserverConfig configserverConfig; private NodeFailTester() { + this(new ConfigserverConfig(new ConfigserverConfig.Builder())); + } + + private NodeFailTester(ConfigserverConfig configserverConfig) { clock = new ManualClock(); curator = new MockCurator(); nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone, new MockNameResolver().mockAnyLookup(), - new DockerImage("docker-registry.domain.tld:8080/dist/vespa")); + new DockerImage("docker-registry.domain.tld:8080/dist/vespa")); provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, zone); hostLivenessTracker = new TestHostLivenessTracker(clock); orchestrator = new OrchestratorMock(); + this.configserverConfig = configserverConfig; } - + public static NodeFailTester withTwoApplications() { - NodeFailTester tester = new NodeFailTester(); + return withTwoApplications(new ConfigserverConfig(new ConfigserverConfig.Builder())); + } + + public static NodeFailTester withTwoApplications(ConfigserverConfig configserverConfig) { + NodeFailTester tester = new NodeFailTester(configserverConfig); tester.createReadyNodes(16); tester.createHostNodes(3); @@ -184,7 +195,7 @@ public class NodeFailTester { } public NodeFailer createFailer() { - return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour, clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric, new JobControl(nodeRepository.database())); + return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour, clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric, new JobControl(nodeRepository.database()), configserverConfig); } public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 6d41cfa08e5..63bc04ac671 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.cloud.config.ConfigserverConfig; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.applicationmodel.ServiceInstance; @@ -194,6 +195,53 @@ public class NodeFailerTest { } @Test + public void docker_host_failed_without_config_requests() { + NodeFailTester tester = NodeFailTester.withTwoApplications(); + + // For a day all nodes work so nothing happens + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size()); + } + + + // Two ready nodes and a ready docker node die, but only 2 of those are failed out + tester.clock.advance(Duration.ofMinutes(180)); + Node dockerHost = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).iterator().next(); + tester.allNodesMakeAConfigRequestExcept(dockerHost); + tester.failer.run(); + assertEquals( 2, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 1, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size()); + } + + @Test + public void not_failed_without_config_requests_if_node_admin_on_host() { + NodeFailTester tester = NodeFailTester.withTwoApplications( + new ConfigserverConfig(new ConfigserverConfig.Builder().nodeAdminInContainer(false))); + + // For a day all nodes work so nothing happens + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size()); + } + + + // Two ready nodes and a ready docker node die, but only 2 of those are failed out + tester.clock.advance(Duration.ofMinutes(180)); + Node dockerHost = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).iterator().next(); + tester.allNodesMakeAConfigRequestExcept(dockerHost); + tester.failer.run(); + assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size()); + } + + @Test public void failing_docker_hosts() { NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7); |