diff options
Diffstat (limited to 'node-repository/src')
5 files changed, 69 insertions, 14 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java index 0b3775a683f..8e90294b4a5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java @@ -174,7 +174,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { * * @param e the exception indicating corruption */ - private void repair(CairoException e) { + private void repair(Exception e) { log.log(Level.WARNING, "QuestDb seems corrupted, wiping data and starting over", e); IOUtils.recursiveDeleteDir(new File(dataDir)); initializeDb(); @@ -182,22 +182,34 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { private void ensureExists(String table) { SqlExecutionContext context = newContext(); + if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { // table exists + ensureUpdated(table, context); + } else { + create(table, context); + } + } + + private void ensureUpdated(String table, SqlExecutionContext context) { try (SqlCompiler compiler = new SqlCompiler(engine)) { if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { ensureColumnExists("inService", "boolean", table, compiler, context); // TODO: Remove after December 2020 ensureColumnExists("stable", "boolean", table, compiler, context); // TODO: Remove after December 2020 } - else { - compiler.compile("create table " + table + - " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," + - " application_generation long, inService boolean, stable boolean)" + - " timestamp(at)" + - "PARTITION BY DAY;", - context); - // We should do this if we get a version where selecting on strings work embedded, see below - // compiler.compile("alter table " + tableName + " alter column hostname add index", context); - } + } catch (SqlException e) { + repair(e); + } + } + private void create(String table, SqlExecutionContext context) { + try (SqlCompiler compiler = new SqlCompiler(engine)) { + compiler.compile("create table " + table + + " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," + + " application_generation long, inService boolean, stable boolean)" + + " timestamp(at)" + + "PARTITION BY DAY;", + context); + // We should do this if we get a version where selecting on strings work embedded, see below + // compiler.compile("alter table " + tableName + " alter column hostname add index", context); } catch (SqlException e) { throw new IllegalStateException("Could not create Quest db table '" + table + "'", e); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index bcdcd9054a7..4a5c28fe0c8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.collections.Pair; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterSpec; @@ -26,10 +27,12 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor; import java.time.Duration; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Collectors; @@ -41,6 +44,7 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed.any; */ public class MetricsReporter extends NodeRepositoryMaintainer { + private final Set<Pair<Metric.Context, String>> nonZeroMetrics = new HashSet<>(); private final Metric metric; private final Orchestrator orchestrator; private final ServiceMonitor serviceMonitor; @@ -285,8 +289,14 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } private void setNonZero(String key, Number value, Metric.Context context) { + var metricKey = new Pair<>(context, key); if (Double.compare(value.doubleValue(), 0.0) != 0) { metric.set(key, value, context); + nonZeroMetrics.add(metricKey); + } else if (nonZeroMetrics.remove(metricKey)) { + // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying + // a non-zero 'last' from earlier periods. + metric.set(key, value, context); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 6d571fada9e..2999655e5fa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -153,7 +153,9 @@ public class NodeFailer extends NodeRepositoryMaintainer { Map<Node, String> nodesByFailureReason = new HashMap<>(); for (Node node : activeNodes) { if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) { - nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); + // Allow a grace period after node re-activation + if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd)) + nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); } else if (hostSuspended(node, activeNodes)) { Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index 42e26814d41..696853b2992 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -70,7 +70,7 @@ public class CuratorDatabaseClient { private static final Path containerImagesPath = root.append("dockerImages"); private static final Path firmwareCheckPath = root.append("firmwareCheck"); - private static final Duration defaultLockTimeout = Duration.ofMinutes(2); + private static final Duration defaultLockTimeout = Duration.ofMinutes(6); private final NodeSerializer nodeSerializer; private final CuratorDatabase db; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index d403affc292..d4dbc6f55a5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -10,6 +10,7 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Report; import com.yahoo.vespa.hosted.provision.node.Reports; import org.junit.Test; @@ -233,7 +234,7 @@ public class NodeFailerTest { assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state()); assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state()); - + String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname(); tester.serviceMonitor.setHostDown(downHost1); @@ -309,6 +310,36 @@ public class NodeFailerTest { } @Test + public void re_activate_grace_period_test() { + NodeFailTester tester = NodeFailTester.withTwoApplications(); + String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); + + tester.serviceMonitor.setHostDown(downNode); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + + tester.clock.advance(Duration.ofMinutes(75)); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state()); + + // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again + tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName()); + tester.clock.advance(Duration.ofMinutes(30)); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + + tester.clock.advance(Duration.ofMinutes(45)); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state()); + } + + @Test public void node_failing_can_allocate_spare() { var resources = new NodeResources(1, 20, 15, 1); Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true); |