// Provenance: recovered from a gitweb "blobdiff_plain" dump of the feisty_meow repository.
// X-Git-Url: https://feistymeow.org/gitweb/?a=blobdiff_plain;f=kona%2Fsrc%2Forg%2Fgffs%2Fnetwork%2FDeadHostChecker.java;fp=kona%2Fsrc%2Forg%2Fgffs%2Fnetwork%2FDeadHostChecker.java;h=4b4a833d71cb01202a06cbabc4c75d4316e53f79;hb=13679382916f379ef7fc0cf02693f0186a905835;hp=0000000000000000000000000000000000000000;hpb=909697fd6009240b5bf2643d54b129b06b733953;p=feisty_meow.git
// File: kona/src/org/gffs/network/DeadHostChecker.java (new file, blob 4b4a833d)
package org.gffs.network;

import java.util.HashMap;

import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

//import edu.virginia.vcgr.genii.client.ClientProperties;

/**
 * Manages a list of hosts that have proven to be down recently. Has support to not immediately fail the host, as
 * this can lead to too quick an assumption that the host is down, but after N tries, the host is out. It will then
 * be tested again periodically so we know when it comes back online.
 *
 * All access to the shared host table made by this class happens while holding the monitor of {@link #deadHosts}.
 */
public class DeadHostChecker
{
    static private final Log _logger = LogFactory.getLog(DeadHostChecker.class);

    // overall timeout used when no other configuration has been supplied: 2 minutes, in ms.
    static private final int DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;

    // need better substitute for cli props.
    public static class ClientProperties
    {
        int timeout = DEFAULT_TIMEOUT_MS; // 2 minutes timeout by default, in ms.

        ClientProperties()
        {
        }
    }

    /*
     * the properties consulted for the overall timeout. this starts out with a default instance so that the checker
     * is usable before anyone configures it; previously it was left null, which made the first recorded failure
     * throw a NullPointerException.
     */
    static public ClientProperties props = new ClientProperties();

    // this value specifies how many attempts can fail before the host is considered down.
    static private final int HOW_MANY_DOWNS_ALLOWED = 1;

    /*
     * this is the longest amount of time between checking of dead hosts that we'll ever pause. exponential backoff
     * will occur up until this delay time, and then stay at this delay time afterwards.
     */
    static private final int MAXIMUM_ALLOWABLE_CHECKING_DELAY = 60 * 1000 * 5; // current is 5 minutes max for exponential backoff on retries.

    /**
     * Identifies a host by name and port; used as the key of the dead host table.
     *
     * The fields are public and mutable for compatibility, but an instance must not be modified while it is in use
     * as a map key, since hashCode() and equals() are computed from both fields.
     */
    public static class HostKey
    {
        public String hostname;
        public int port;

        HostKey(String hostname, int port)
        {
            this.hostname = hostname;
            this.port = port;
        }

        @Override
        public int hashCode()
        {
            return new HashCodeBuilder(37, 839). // two randomly chosen prime numbers
            // if deriving: appendSuper(super.hashCode()).
                append(hostname).append(port).toHashCode();
        }

        @Override
        public boolean equals(Object o)
        {
            if (!(o instanceof HostKey))
                return false; // wrong object.
            HostKey realo = (HostKey) o;
            if (realo.port != port)
                return false;
            // null-safe comparison, so that equals() tolerates the same null hostname that hashCode() does.
            return (realo.hostname == null) ? (hostname == null) : realo.hostname.equals(hostname);
        }

        @Override
        public String toString()
        {
            return hostname + ":" + port;
        }
    }

    // the table of hosts that have failed recently. guard every access by synchronizing on this object.
    static final HashMap<HostKey, RetryInfo> deadHosts = new HashMap<HostKey, RetryInfo>();

    /**
     * Tracks the failure count and the retry schedule for one host.
     *
     * This class does no locking of its own; within this file it is only used while the deadHosts monitor is held.
     */
    public static class RetryInfo
    {
        public long nextTime;
        public int delay;
        public int downCount = 0;

        public RetryInfo()
        {
            // We just failed, so base a delay on the overall timeout to delay our next attempt.
            delay = initialDelay();
            nextTime = System.currentTimeMillis() + delay;
        }

        /**
         * @return the first retry delay in ms: half of the configured overall timeout, and never less than 1 ms
         *         since a zero delay could never grow by doubling.
         */
        int initialDelay()
        {
            ClientProperties current = props;
            int timeout = (current != null) ? current.timeout : DEFAULT_TIMEOUT_MS;
            return Math.max(1, timeout / 2);
        }

        /**
         * Reports whether this host should currently be treated as dead.
         *
         * Note that this is not a pure query: once the retry time has passed, it pushes nextTime out by the current
         * delay and answers false, which grants the caller one retry attempt.
         *
         * @return true if the host has used up its allowed failures and its next retry time has not arrived yet.
         */
        boolean isThisHostDead()
        {
            if (downCount < HOW_MANY_DOWNS_ALLOWED) {
                return false;
            }
            long now = System.currentTimeMillis();
            if (now > nextTime) {
                // this host is being allowed a retry.
                nextTime = now + delay;
                return false;
            }
            return true;
        }

        /**
         * Counts one more failed attempt against this host.
         */
        void recordDown()
        {
            downCount++;
        }
    }

    /**
     * checks the host in our records and returns true if it is considered alive and false if it is considered dead.
     *
     * @param host the name of the host to look up.
     * @param port the port on that host.
     * @return true if there is no record for the host, or if its record currently permits an attempt.
     */
    public static boolean evaluateHostAlive(String host, int port)
    {
        HostKey key = new HostKey(host, port);

        // Added July 14, 2015 by ASG to deal with dead hosts and not bother trying to talk to them. The timeouts kill us.
        synchronized (deadHosts) {
            // the lookup has to use the HostKey; looking up the bare host string can never match an entry, which
            // made every host appear alive.
            RetryInfo inf = deadHosts.get(key);
            if (inf != null) {
                return !inf.isThisHostDead();
            }
            // up as far as we know; no record exists.
            // NOTE(review): guarded by the trace level but emitted at debug level; kept as found -- confirm intent.
            if (_logger.isTraceEnabled())
                _logger.debug("host " + key + " is fine as far as we know.");
            return true;
        }
    }

    /**
     * Records a communication failure for the host. A host with no record gets one; a host that is already
     * considered dead has its retry delay doubled, up to MAXIMUM_ALLOWABLE_CHECKING_DELAY.
     *
     * @param host the name of the host that failed.
     * @param port the port on that host.
     */
    public static void addHostToDeadPool(String host, int port)
    {
        HostKey key = new HostKey(host, port);

        synchronized (deadHosts) {
            RetryInfo inf = deadHosts.get(key);
            if (inf == null) {
                // Not there, set it up and add it.
                inf = new RetryInfo();
                deadHosts.put(key, inf);
            }

            boolean alreadyDead = false;
            if (inf.isThisHostDead()) {
                // this one is already down so expand the timeout.
                // NOTE(review): warn-level message that is only emitted when debug is enabled; kept as found.
                if (_logger.isDebugEnabled())
                    _logger.warn("host " + key + " is considered dead already; increasing delay.");
                // apply the cap before scheduling, so the next check is never further away than the maximum. the
                // doubling is done in long arithmetic so an oversized delay cannot wrap around.
                inf.delay = (int) Math.min(2L * inf.delay, (long) MAXIMUM_ALLOWABLE_CHECKING_DELAY);
                inf.nextTime = System.currentTimeMillis() + inf.delay;
                // flag this so we don't say something again below.
                alreadyDead = true;
            }

            // we definitely saw this host as down at least once, so record that now.
            inf.recordDown();

            if (!inf.isThisHostDead()) {
                // still up, although we needed to record that failure.
                if (_logger.isDebugEnabled())
                    _logger.debug("host " + key + " is not dead yet but suffered a connection problem.");
            } else {
                // this is dead now. say something about it if we didn't already.
                // NOTE(review): warn-level message that is only emitted when debug is enabled; kept as found.
                if (!alreadyDead && _logger.isDebugEnabled())
                    _logger.warn("host " + key + " is newly considered dead due to communication problems.");
            }
        }
    }

    /**
     * Forgets any failure record for the host, so that it is considered alive again.
     *
     * @param host the name of the host that was reported alive.
     * @param port the port on that host.
     */
    public static void removeHostFromDeadPool(String host, int port)
    {
        HostKey key = new HostKey(host, port);

        // Well, the host was reported alive again, so remove if it is in deadHosts.
        synchronized (deadHosts) {
            // drop it from the list. if it's not present, we don't say anything.
            RetryInfo removed = deadHosts.remove(key);
            if ((removed != null) && _logger.isDebugEnabled()) {
                _logger.debug("host " + key + " is being removed from dead host pool.");
            }
        }
    }

}