diff options
author | hakonhall <hakon@yahoo-inc.com> | 2017-06-19 09:19:27 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-19 09:19:27 +0200 |
commit | ffad94583f16953baa5787cbab911ddd8b636d87 (patch) | |
tree | a8a0c79a67c51ae48bd9320107f519051a1e18bc /node-admin | |
parent | a533bd755946dd741a2b88d3a81f30f789fa1555 (diff) | |
parent | 08df5af4be85b9085dc68ca9c099bb8470f0d179 (diff) |
Merge pull request #2815 from yahoo/hakon/logging-tuning-in-nodeadmin
Logging tuning in NodeAdmin
Diffstat (limited to 'node-admin')
2 files changed, 30 insertions, 21 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index b0bb5d1b7b1..e3473fc09b0 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -2,11 +2,11 @@ package com.yahoo.vespa.hosted.node.admin.nodeadmin; import com.yahoo.component.AbstractComponent; +import com.yahoo.log.LogLevel; import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; import com.yahoo.vespa.hosted.node.admin.noderepository.NodeRepository; import com.yahoo.vespa.hosted.node.admin.orchestrator.Orchestrator; import com.yahoo.vespa.hosted.node.admin.orchestrator.OrchestratorException; -import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger; import com.yahoo.vespa.hosted.provision.Node; import java.io.IOException; @@ -18,6 +18,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.logging.Logger; import java.util.stream.Collectors; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED; @@ -39,7 +40,7 @@ public class NodeAdminStateUpdater extends AbstractComponent { private final Object monitor = new Object(); - private final PrefixLogger logger = PrefixLogger.getNodeAdminLogger(NodeAdminStateUpdater.class); + private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName()); private Thread loopThread; private final NodeRepository nodeRepository; @@ -81,6 +82,7 @@ public class NodeAdminStateUpdater extends AbstractComponent { public boolean setResumeStateAndCheckIfResumed(State wantedState) { synchronized (monitor) { if (this.wantedState != wantedState) { + log.info("Wanted state change: " + this.wantedState + " -> " + wantedState); this.wantedState = wantedState; signalWorkToBeDone(); } @@ -107,7 +109,7 @@ public class NodeAdminStateUpdater extends AbstractComponent { try { monitor.wait(remainder); } catch (InterruptedException e) { - logger.error("Interrupted, but ignoring this: NodeAdminStateUpdater"); + log.info("Interrupted, but ignoring this: NodeAdminStateUpdater"); } } else break; } @@ -124,13 +126,10 @@ public class NodeAdminStateUpdater extends AbstractComponent { try { convergeState(wantedState); converged = true; - } catch (OrchestratorException e) { - logger.info("Orchestrator does not give permission to converge to " + wantedState - + ", will retry shortly: " + e.getMessage()); - } catch (ConvergenceException e) { - logger.info(e.getMessage()); + } catch (OrchestratorException | ConvergenceException e) { + log.info("Unable to converge to " + wantedState + ": " + e.getMessage()); } catch (Exception e) { - logger.error("Error while trying to converge to " + wantedState, e); + log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedState, e); } if (wantedState != RESUMED && !converged) { @@ -138,7 +137,7 @@ public class NodeAdminStateUpdater extends AbstractComponent { if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) { // We have spent too long time trying to freeze and node admin is still not frozen. // To avoid node agents stalling for too long, we'll force unfrozen ticks now. - logger.info("Timed out trying to freeze, will force unfreezed ticks"); + log.info("Timed out trying to freeze, will force unfreezed ticks"); nodeAdmin.setFrozen(false); } } @@ -153,7 +152,7 @@ public class NodeAdminStateUpdater extends AbstractComponent { private void convergeState(State wantedState) { boolean wantFrozen = wantedState != RESUMED; if (!nodeAdmin.setFrozen(wantFrozen)) { - throw new ConvergenceException("NodeAdmin has not yet converged to " + (wantFrozen ? "frozen" : "unfrozen")); + throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); } if (wantedState == RESUMED) { @@ -188,6 +187,8 @@ public class NodeAdminStateUpdater extends AbstractComponent { private State updateAndGetCurrentState(State currentState) { synchronized (monitor) { + log.info("State change: " + this.currentState + " -> " + currentState + + (currentState == wantedState ? " [converged]" : "")); this.currentState = currentState; return currentState; } @@ -198,24 +199,24 @@ public class NodeAdminStateUpdater extends AbstractComponent { // Refresh containers to run even if we would like to suspend but have failed to do so yet, // because it may take a long time to get permission to suspend. if (currentState != RESUMED) { - logger.info("Frozen, skipping fetching info from node repository"); + log.info("Frozen, skipping fetching info from node repository"); return; } final List<ContainerNodeSpec> containersToRun; try { containersToRun = nodeRepository.getContainersToRun(); - } catch (Throwable t) { - logger.warning("Failed fetching container info from node repository", t); + } catch (Exception e) { + log.log(LogLevel.WARNING, "Failed fetching container info from node repository", e); return; } if (containersToRun == null) { - logger.warning("Got null from node repository"); + log.warning("Got null from node repository"); return; } try { nodeAdmin.refreshContainersToRun(containersToRun); - } catch (Throwable t) { - logger.warning("Failed updating node admin: ", t); + } catch (Exception e) { + log.log(LogLevel.WARNING, "Failed updating node admin: ", e); } } } @@ -250,10 +251,10 @@ public class NodeAdminStateUpdater extends AbstractComponent { try { loopThread.join(10000); if (loopThread.isAlive()) { - logger.error("Could not stop NodeAdminStateUpdater tick thread"); + log.log(LogLevel.ERROR, "Could not stop tick thread"); } } catch (InterruptedException e1) { - logger.error("Interrupted; Could not stop NodeAdminStateUpdater thread"); + log.log(LogLevel.ERROR, "Interrupted; Could not stop thread"); } nodeAdmin.shutdown(); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/util/ConfigServerHttpRequestExecutor.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/util/ConfigServerHttpRequestExecutor.java index 2b4bf8b5750..4434213989f 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/util/ConfigServerHttpRequestExecutor.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/util/ConfigServerHttpRequestExecutor.java @@ -83,8 +83,14 @@ public class ConfigServerHttpRequestExecutor { try { response = client.execute(requestFactory.createRequest(configServer)); } catch (Exception e) { + // Failure to communicate with a config server is not abnormal, as they are + // upgraded at the same time as Docker hosts. + if (e.getMessage().indexOf("(Connection refused)") > 0) { + NODE_ADMIN_LOGGER.info("Connection refused to " + configServer + " (upgrading?), will try next"); + } else { + NODE_ADMIN_LOGGER.warning("Failed to communicate with " + configServer + ", will try next: " + e.getMessage()); + } lastException = e; - NODE_ADMIN_LOGGER.info("Exception while talking to " + configServer + " (will try all config servers):" + e.getMessage()); continue; } @@ -107,7 +113,9 @@ public class ConfigServerHttpRequestExecutor { } } } - throw new RuntimeException("Failed executing request, last exception: ", lastException); + + throw new RuntimeException("All requests against the config servers (" + + configServerHosts + ") failed, last as follows:", lastException); } public <T> T put(String path, int port, Optional<Object> bodyJsonPojo, Class<T> wantedReturnType) { |