about | summary | refs | log | tree | commit | diff | stats
path: root/node-admin
diff options
context:
space:
mode:
author: Torbjørn Smørgrav <smorgrav@users.noreply.github.com> 2018-05-31 13:55:38 +0200
committer: GitHub <noreply@github.com> 2018-05-31 13:55:38 +0200
commit: 73883cb49d69fd738f1dfb29c8584b98b345e3b2 (patch)
tree: 646d9f394cb1047aeef7b99728704e9e013d5b5d /node-admin
parent: 8f0eb705f936a383fde4e89f3daab752a8537ef7 (diff)
parent: 995c4136aad9f415fe8bf5a7fffd3a0d53a3ea77 (diff)
Merge pull request #6004 from vespa-engine/smorgrav/yamaschecks
Yamas config written by node-agent
Diffstat (limited to 'node-admin')
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java 105
1 file changed, 85 insertions, 20 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index 0c5dd72c968..7f2d1f1eff7 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -5,16 +5,19 @@ import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.collections.Pair;
+import com.yahoo.config.provision.NodeType;
import com.yahoo.io.IOUtils;
import com.yahoo.system.ProcessExecuter;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.metrics.CounterWrapper;
import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions;
+import com.yahoo.vespa.hosted.dockerapi.metrics.GaugeWrapper;
import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
import com.yahoo.vespa.hosted.node.admin.docker.DockerOperations;
import com.yahoo.vespa.hosted.node.admin.logging.FilebeatConfigProvider;
import com.yahoo.vespa.hosted.node.admin.component.Environment;
+import com.yahoo.vespa.hosted.node.admin.task.util.file.IOExceptionUtil;
import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger;
import com.yahoo.vespa.hosted.node.admin.util.SecretAgentCheckConfig;
@@ -46,6 +49,7 @@ public class StorageMaintainer {
private static final ContainerName NODE_ADMIN = new ContainerName("node-admin");
private static final ObjectMapper objectMapper = new ObjectMapper();
+ private final GaugeWrapper numberOfCoredumpsOnHost;
private final CounterWrapper numberOfNodeAdminMaintenanceFails;
private final DockerOperations dockerOperations;
private final ProcessExecuter processExecuter;
@@ -54,7 +58,6 @@ public class StorageMaintainer {
private final Map<ContainerName, MaintenanceThrottler> maintenanceThrottlerByContainerName = new ConcurrentHashMap<>();
-
public StorageMaintainer(DockerOperations dockerOperations, ProcessExecuter processExecuter, MetricReceiverWrapper metricReceiver, Environment environment, Clock clock) {
this.dockerOperations = dockerOperations;
this.processExecuter = processExecuter;
@@ -63,44 +66,98 @@ public class StorageMaintainer {
Dimensions dimensions = new Dimensions.Builder().add("role", "docker").build();
numberOfNodeAdminMaintenanceFails = metricReceiver.declareCounter(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "nodes.maintenance.fails");
+ numberOfCoredumpsOnHost = metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "nodes.coredumps");
}
public void writeMetricsConfig(ContainerName containerName, NodeSpec node) {
- final Path yamasAgentFolder = environment.pathInNodeAdminFromPathInNode(
- containerName, Paths.get("/etc/yamas-agent/"));
-
- Path vespaCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_vespa");
- SecretAgentCheckConfig vespaSchedule = new SecretAgentCheckConfig("vespa", 60, vespaCheckPath, "all")
- .withTag("parentHostname", environment.getParentHostHostname());
+ List<SecretAgentCheckConfig> configs = new ArrayList<>();
+ // host-life
Path hostLifeCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_host_life");
- SecretAgentCheckConfig hostLifeSchedule = new SecretAgentCheckConfig("host-life", 60, hostLifeCheckPath)
- .withTag("namespace", "Vespa")
+ SecretAgentCheckConfig hostLifeSchedule = new SecretAgentCheckConfig("host-life", 60, hostLifeCheckPath);
+ configs.add(annotatedCheck(node, hostLifeSchedule));
+
+ // ntp
+ Path ntpCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_ntp");
+ SecretAgentCheckConfig ntpSchedule = new SecretAgentCheckConfig("ntp", 60, ntpCheckPath);
+ configs.add(annotatedCheck(node, ntpSchedule));
+
+ // coredumps (except for the done coredumps which is handled by the host)
+ Path coredumpCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_coredumps");
+ SecretAgentCheckConfig coredumpSchedule = new SecretAgentCheckConfig("system-coredumps-processing", 300,
+ coredumpCheckPath, "--application", "system-coredumps-processing", "--lastmin",
+ "129600", "--crit", "1", "--coredir", environment.pathInNodeUnderVespaHome("var/crash/processing").toString());
+ configs.add(annotatedCheck(node, coredumpSchedule));
+
+ if (node.getNodeType() != NodeType.config) {
+ // vespa-health
+ Path vespaHealthCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_vespa_health");
+ SecretAgentCheckConfig vespaHealthSchedule = new SecretAgentCheckConfig("vespa-health", 60, vespaHealthCheckPath, "all");
+ configs.add(annotatedCheck(node, vespaHealthSchedule));
+
+ // vespa
+ Path vespaCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_vespa");
+ SecretAgentCheckConfig vespaSchedule = new SecretAgentCheckConfig("vespa", 60, vespaCheckPath, "all");
+ configs.add(annotatedCheck(node, vespaSchedule));
+ }
+
+ if (node.getNodeType() == NodeType.config) {
+ // configserver
+ Path configServerCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_ymonsb2");
+ SecretAgentCheckConfig configServerSchedule = new SecretAgentCheckConfig("configserver", 60,
+ configServerCheckPath, "-zero", "configserver");
+ configs.add(annotatedCheck(node, configServerSchedule));
+
+ //zkbackupage
+ Path zkbackupCheckPath = environment.pathInNodeUnderVespaHome("libexec/yamas2/yms_check_file_age.py");
+ SecretAgentCheckConfig zkbackupSchedule = new SecretAgentCheckConfig("zkbackupage", 300,
+ zkbackupCheckPath, "-f", environment.pathInNodeUnderVespaHome("var/vespa-hosted/zkbackup.stat").toString(),
+ "-m", "150", "-a", "config-zkbackupage");
+ configs.add(annotatedCheck(node, zkbackupSchedule));
+ }
+
+ if (node.getNodeType() == NodeType.proxy) {
+ //routing-configage
+ Path routingAgeCheckPath = environment.pathInNodeUnderVespaHome("libexec/yamas2/yms_check_file_age.py");
+ SecretAgentCheckConfig routingAgeSchedule = new SecretAgentCheckConfig("routing-configage", 60,
+ routingAgeCheckPath, "-f", environment.pathInNodeUnderVespaHome("var/vespa-hosted/routing/nginx.conf").toString(),
+ "-m", "90", "-a", "routing-configage");
+ configs.add(annotatedCheck(node, routingAgeSchedule));
+
+ //ssl-check
+ Path sslCheckPath = environment.pathInNodeUnderVespaHome("libexec/yms/yms_check_ssl_status");
+ SecretAgentCheckConfig sslSchedule = new SecretAgentCheckConfig("ssl-status", 300,
+ sslCheckPath, "-e", "localhost", "-p", "4443", "-t", "30");
+ configs.add(annotatedCheck(node, sslSchedule));
+ }
+
+ // Write config and restart yamas-agent
+ Path yamasAgentFolder = environment.pathInNodeAdminFromPathInNode(containerName, Paths.get("/etc/yamas-agent/"));
+ configs.forEach(s -> IOExceptionUtil.uncheck(() -> s.writeTo(yamasAgentFolder)));
+ final String[] restartYamasAgent = new String[]{"service", "yamas-agent", "restart"};
+ dockerOperations.executeCommandInContainerAsRoot(containerName, restartYamasAgent);
+ }
+
+ private SecretAgentCheckConfig annotatedCheck(NodeSpec node, SecretAgentCheckConfig check) {
+ check.withTag("namespace", "Vespa")
.withTag("role", "tenants")
.withTag("flavor", node.getFlavor())
.withTag("canonicalFlavor", node.getCanonicalFlavor())
.withTag("state", node.getState().toString())
.withTag("zone", environment.getZone())
.withTag("parentHostname", environment.getParentHostHostname());
- node.getOwner().ifPresent(owner -> hostLifeSchedule
+ node.getOwner().ifPresent(owner -> check
.withTag("tenantName", owner.getTenant())
.withTag("app", owner.getApplication() + "." + owner.getInstance())
.withTag("applicationName", owner.getApplication())
.withTag("instanceName", owner.getInstance())
.withTag("applicationId", owner.getTenant() + "." + owner.getApplication() + "." + owner.getInstance()));
- node.getMembership().ifPresent(membership -> hostLifeSchedule
+ node.getMembership().ifPresent(membership -> check
.withTag("clustertype", membership.getClusterType())
.withTag("clusterid", membership.getClusterId()));
- node.getVespaVersion().ifPresent(version -> hostLifeSchedule.withTag("vespaVersion", version));
+ node.getVespaVersion().ifPresent(version -> check.withTag("vespaVersion", version));
- try {
- vespaSchedule.writeTo(yamasAgentFolder);
- hostLifeSchedule.writeTo(yamasAgentFolder);
- final String[] restartYamasAgent = new String[]{"service", "yamas-agent", "restart"};
- dockerOperations.executeCommandInContainerAsRoot(containerName, restartYamasAgent);
- } catch (IOException e) {
- throw new RuntimeException("Failed to write secret-agent schedules for " + containerName, e);
- }
+ return check;
}
public void writeFilebeatConfig(ContainerName containerName, NodeSpec node) {
@@ -218,6 +275,14 @@ public class StorageMaintainer {
* @param force Set to true to bypass throttling
*/
public void handleCoreDumpsForContainer(ContainerName containerName, NodeSpec node, boolean force) {
+ // Sample number of coredumps on the host
+ try {
+ numberOfCoredumpsOnHost.sample(Files.list(environment.pathInNodeAdminToDoneCoredumps()).count());
+ } catch (IOException e) {
+ // Ignore for now - this is either test or a misconfiguration
+ }
+
+ // Return early if throttled
if (! getMaintenanceThrottlerFor(containerName).shouldHandleCoredumpsNow() && !force) return;
MaintainerExecutor maintainerExecutor = new MaintainerExecutor();