diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2019-09-18 15:46:24 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-09-18 15:46:24 +0200 |
commit | 48272e7af2589989361a4bdc13f57fddc7f3a09e (patch) | |
tree | 9b42951db85ad16155da048f254ea3591fb2084f | |
parent | 9f2977281ae8d719f20f3d69951806ca92b34fab (diff) | |
parent | d7e9109f6ab6a147efcbfc99093c48cdad8d0e0a (diff) |
Merge pull request #10701 from vespa-engine/olaa/more-fault-tolerant-metric-retrieval
Set request timeout
2 files changed, 19 insertions, 12 deletions
```diff
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterMetricsRetriever.java b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterMetricsRetriever.java
index c23af021e3b..ffa820bd433 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterMetricsRetriever.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterMetricsRetriever.java
@@ -6,6 +6,8 @@ import com.yahoo.slime.ArrayTraverser;
 import com.yahoo.slime.Inspector;
 import com.yahoo.slime.Slime;
 import com.yahoo.vespa.config.SlimeUtils;
+import com.yahoo.yolean.Exceptions;
+import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
@@ -39,7 +41,12 @@ public class ClusterMetricsRetriever {

     private static final List<String> WANTED_METRIC_SERVICES = List.of(VESPA_CONTAINER, VESPA_QRSERVER, VESPA_DISTRIBUTOR);

-    private static final CloseableHttpClient httpClient = VespaHttpClientBuilder.create().build();
+    private static final CloseableHttpClient httpClient = VespaHttpClientBuilder.create()
+            .setDefaultRequestConfig(RequestConfig.custom()
+                    .setConnectTimeout(10 * 1000)
+                    .setSocketTimeout(10 * 1000)
+                    .build())
+            .build();

     /**
      * Call the metrics API on each host and aggregate the metrics
@@ -88,7 +95,7 @@ public class ClusterMetricsRetriever {
             return slime;
         } catch (IOException e) {
             // Usually caused by applications being deleted during metric retrieval
-            log.warning("Was unable to fetch metrics from " + hostURI);
+            log.warning("Was unable to fetch metrics from " + hostURI + " : " + Exceptions.toMessageString(e));
             return new Slime();
         }
     }
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index feee2edf896..7c060c599ef 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -46,13 +46,13 @@ public class DeploymentMetricsMaintainer extends Maintainer {
         // Run parallel stream inside a custom ForkJoinPool so that we can control the number of threads used
         ForkJoinPool pool = new ForkJoinPool(applicationsToUpdateInParallel);

-        pool.submit(() -> {
+        pool.submit(() ->
             applicationList.parallelStream().forEach(application -> {
-                try {
-                    applications.lockIfPresent(application.id(), locked ->
-                            applications.store(locked.with(controller().metrics().getApplicationMetrics(application.id()))));
+                applications.lockIfPresent(application.id(), locked ->
+                        applications.store(locked.with(controller().metrics().getApplicationMetrics(application.id()))));

-                    for (Deployment deployment : application.deployments().values()) {
+                for (Deployment deployment : application.deployments().values()) {
+                    try {
                         if (deployment.version().getMajor() < 7) continue;
                         var collectedMetrics = controller().metrics().getDeploymentMetrics(application.id(), deployment.zone());
                         var now = controller().clock().instant();
@@ -70,13 +70,13 @@ public class DeploymentMetricsMaintainer extends Maintainer {
                                     .recordActivityAt(now, existingDeployment.zone()));
                         });

+                    } catch (Exception e) {
+                        failures.incrementAndGet();
+                        lastException.set(e);
                     }
-                } catch (Exception e) {
-                    failures.incrementAndGet();
-                    lastException.set(e);
                 }
-            });
-        });
+            })
+        );
         pool.shutdown();
         try {
             pool.awaitTermination(30, TimeUnit.MINUTES);
```