summary | refs | log | tree | commit | diff | stats
path: root/node-repository/src
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java 34
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java 10
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 4
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java 2
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 33
5 files changed, 69 insertions, 14 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
index 0b3775a683f..8e90294b4a5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
@@ -174,7 +174,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
*
* @param e the exception indicating corruption
*/
- private void repair(CairoException e) {
+ private void repair(Exception e) {
log.log(Level.WARNING, "QuestDb seems corrupted, wiping data and starting over", e);
IOUtils.recursiveDeleteDir(new File(dataDir));
initializeDb();
@@ -182,22 +182,34 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
private void ensureExists(String table) { // brings an existing table up to date, or creates it when it is missing
SqlExecutionContext context = newContext();
+ if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { // table exists
+ ensureUpdated(table, context); // existing table: make sure the later-added columns are present
+ } else {
+ create(table, context); // no such table: create it with the full column set
+ }
+ }
+
+ private void ensureUpdated(String table, SqlExecutionContext context) { // ensures the 'inService' and 'stable' columns exist in the given table
try (SqlCompiler compiler = new SqlCompiler(engine)) {
if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { // NOTE(review): ensureExists makes this same check before calling here - confirm whether it is still needed
ensureColumnExists("inService", "boolean", table, compiler, context); // TODO: Remove after December 2020
ensureColumnExists("stable", "boolean", table, compiler, context); // TODO: Remove after December 2020
}
- else {
- compiler.compile("create table " + table +
- " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," +
- " application_generation long, inService boolean, stable boolean)" +
- " timestamp(at)" +
- "PARTITION BY DAY;",
- context);
- // We should do this if we get a version where selecting on strings work embedded, see below
- // compiler.compile("alter table " + tableName + " alter column hostname add index", context);
- }
+ } catch (SqlException e) {
+ repair(e); // a failed update is handled as corruption: repair() deletes the data dir and re-initializes the db
+ }
+ }
+ private void create(String table, SqlExecutionContext context) {
+ try (SqlCompiler compiler = new SqlCompiler(engine)) {
+ compiler.compile("create table " + table +
+ " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," +
+ " application_generation long, inService boolean, stable boolean)" +
+ " timestamp(at)" +
+ "PARTITION BY DAY;",
+ context);
+ // We should do this if we get a version where selecting on strings work embedded, see below
+ // compiler.compile("alter table " + tableName + " alter column hostname add index", context);
}
catch (SqlException e) {
throw new IllegalStateException("Could not create Quest db table '" + table + "'", e);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index bcdcd9054a7..4a5c28fe0c8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.collections.Pair;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ClusterSpec;
@@ -26,10 +27,12 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Duration;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.Set;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
@@ -41,6 +44,7 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed.any;
*/
public class MetricsReporter extends NodeRepositoryMaintainer {
+ private final Set<Pair<Metric.Context, String>> nonZeroMetrics = new HashSet<>();
private final Metric metric;
private final Orchestrator orchestrator;
private final ServiceMonitor serviceMonitor;
@@ -285,8 +289,14 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
}
private void setNonZero(String key, Number value, Metric.Context context) { // sets non-zero values always, and a zero value only when a non-zero value was set before it
+ var metricKey = new Pair<>(context, key); // identifies this metric in nonZeroMetrics
if (Double.compare(value.doubleValue(), 0.0) != 0) { // note: Double.compare also orders -0.0 and NaN as different from 0.0
metric.set(key, value, context);
+ nonZeroMetrics.add(metricKey); // remember that a non-zero value has been set for this key and context
+ } else if (nonZeroMetrics.remove(metricKey)) { // true only if a non-zero value was recorded and not yet reset
+ // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying
+ // a non-zero 'last' from earlier periods.
+ metric.set(key, value, context);
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 6d571fada9e..2999655e5fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -153,7 +153,9 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Map<Node, String> nodesByFailureReason = new HashMap<>();
for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
- nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ // Allow a grace period after node re-activation
+ if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd))
+ nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
}
else if (hostSuspended(node, activeNodes)) {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index 42e26814d41..696853b2992 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -70,7 +70,7 @@ public class CuratorDatabaseClient {
private static final Path containerImagesPath = root.append("dockerImages");
private static final Path firmwareCheckPath = root.append("firmwareCheck");
- private static final Duration defaultLockTimeout = Duration.ofMinutes(2);
+ private static final Duration defaultLockTimeout = Duration.ofMinutes(6);
private final NodeSerializer nodeSerializer;
private final CuratorDatabase db;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index d403affc292..d4dbc6f55a5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Report;
import com.yahoo.vespa.hosted.provision.node.Reports;
import org.junit.Test;
@@ -233,7 +234,7 @@ public class NodeFailerTest {
assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
-
+
String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
tester.serviceMonitor.setHostDown(downHost1);
@@ -309,6 +310,36 @@ public class NodeFailerTest {
}
@Test
+ public void re_activate_grace_period_test() { // a re-activated node that is still down must not be failed again until a new grace period has passed
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
+ String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
+
+ tester.serviceMonitor.setHostDown(downNode); // report the chosen node as down
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); // nothing is failed right after going down
+
+ tester.clock.advance(Duration.ofMinutes(75)); // 75 minutes down: the node is expected to be failed below
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+
+ // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again
+ tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName());
+ tester.clock.advance(Duration.ofMinutes(30)); // 30 minutes after re-activation: expected to still be in the grace period
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(45)); // 75 minutes after re-activation in total: expected to be failed again
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+ }
+
+ @Test
public void node_failing_can_allocate_spare() {
var resources = new NodeResources(1, 20, 15, 1);
Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true);