001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.ha;
020
021import java.io.IOException;
022import java.util.Arrays;
023import java.util.List;
024import java.util.concurrent.CountDownLatch;
025import java.util.concurrent.TimeUnit;
026import java.util.concurrent.locks.Lock;
027import java.util.concurrent.locks.ReentrantLock;
028
029import org.apache.commons.logging.Log;
030import org.apache.commons.logging.LogFactory;
031import org.apache.hadoop.HadoopIllegalArgumentException;
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.classification.InterfaceStability;
034import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
035import org.apache.hadoop.util.StringUtils;
036import org.apache.zookeeper.data.ACL;
037import org.apache.zookeeper.KeeperException;
038import org.apache.zookeeper.Watcher;
039import org.apache.zookeeper.WatchedEvent;
040import org.apache.zookeeper.Watcher.Event;
041import org.apache.zookeeper.ZKUtil;
042import org.apache.zookeeper.ZooKeeper;
043import org.apache.zookeeper.CreateMode;
044import org.apache.zookeeper.AsyncCallback.*;
045import org.apache.zookeeper.data.Stat;
046import org.apache.zookeeper.KeeperException.Code;
047
048import com.google.common.annotations.VisibleForTesting;
049import com.google.common.base.Preconditions;
050
051/**
052 * 
053 * This class implements a simple library to perform leader election on top of
054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
055 * can be performed by atomically creating an ephemeral lock file (znode) on
056 * Zookeeper. The service instance that successfully creates the znode becomes
057 * active and the rest become standbys. <br/>
058 * This election mechanism is only efficient for small number of election
059 * candidates (order of 10's) because contention on single znode by a large
060 * number of candidates can result in Zookeeper overload. <br/>
061 * The elector does not guarantee fencing (protection of shared resources) among
062 * service instances. After it has notified an instance about becoming a leader,
063 * then that instance must ensure that it meets the service consistency
064 * requirements. If it cannot do so, then it is recommended to quit the
065 * election. The application implements the {@link ActiveStandbyElectorCallback}
066 * to interact with the elector
067 */
068@InterfaceAudience.Private
069@InterfaceStability.Evolving
070public class ActiveStandbyElector implements StatCallback, StringCallback {
071
  /**
   * Callback interface through which the ActiveStandbyElector reports
   * election results and errors to the application. <br/>
   * The application will be notified with a callback only on state changes
   * (i.e. there will never be successive calls to becomeActive without an
   * intermediate call to enterNeutralMode). <br/>
   * The callbacks will be running on Zookeeper client library threads. The
   * application should return from these callbacks quickly so as not to impede
   * Zookeeper client library performance and notifications. The app will
   * typically remember the state change and return from the callback. It will
   * then proceed with implementing actions around that state change. It is
   * possible to be called back again while these actions are in flight and the
   * app should handle this scenario.
   */
  public interface ActiveStandbyElectorCallback {
    /**
     * Called when the app becomes the active leader.
     * If the service fails to become active, it should throw
     * ServiceFailedException. This will cause the elector to
     * sleep for a short period, then re-join the election.
     * 
     * Callback implementations are expected to manage their own
     * timeouts (e.g. when making an RPC to a remote node).
     *
     * @throws ServiceFailedException if the service could not become active
     */
    void becomeActive() throws ServiceFailedException;

    /**
     * Called when the app becomes a standby.
     */
    void becomeStandby();

    /**
     * If the elector gets disconnected from Zookeeper and does not know about
     * the lock state, then it will notify the service via the enterNeutralMode
     * interface. The service may choose to ignore this or stop doing state
     * changing operations. Upon reconnection, the elector verifies the leader
     * status and calls back on the becomeActive and becomeStandby app
     * interfaces. <br/>
     * Zookeeper disconnects can happen due to network issues or loss of
     * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
     * split-brain issues. In such situations it might be prudent to call
     * becomeStandby too. However, such state change operations might be
     * expensive and enterNeutralMode can help guard against doing that for
     * transient issues.
     */
    void enterNeutralMode();

    /**
     * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
     * errors or Zookeeper persistent unavailability) then notifyFatalError is
     * called to notify the app about it.
     *
     * @param errorMessage description of the fatal error
     */
    void notifyFatalError(String errorMessage);

    /**
     * If an old active has failed, rather than exited gracefully, then
     * the new active may need to take some fencing actions against it
     * before proceeding with failover.
     * 
     * @param oldActiveData the application data provided by the prior active
     */
    void fenceOldActive(byte[] oldActiveData);
  }
134
  /**
   * Name of the lock znode used by the library. Protected for access in test
   * classes.
   */
  @VisibleForTesting
  protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
  /**
   * Name of the persistent "bread-crumb" znode written by a node when it
   * becomes active. Protected for access in test classes.
   */
  @VisibleForTesting
  protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";

  public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);

  // Value handed to reJoinElection()/sleepFor() (milliseconds) after a failed
  // attempt to become active.
  private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;

  // Connection status as tracked by this elector: updated from watch events
  // and set to TERMINATED when the connection is torn down.
  private static enum ConnectionState {
    DISCONNECTED, CONNECTED, TERMINATED
  };

  // Election state of this elector. INIT is the initial value and the value
  // restored by reset(); ACTIVE is set once becomeActive() succeeds.
  // NOTE(review): STANDBY and NEUTRAL are presumably set by becomeStandby()
  // and enterNeutralMode(), which are outside this part of the file.
  static enum State {
    INIT, ACTIVE, STANDBY, NEUTRAL
  };

  private State state = State.INIT;
  // Retry counters for the asynchronous create and stat (monitor) operations;
  // each is compared against maxRetryNum in the corresponding callback.
  private int createRetryCount = 0;
  private int statRetryCount = 0;
  // Current ZooKeeper client; null while no connection is held.
  private ZooKeeper zkClient;
  // Watcher registered with the current client; cleared together with it.
  private WatcherWithClientRef watcher;
  private ConnectionState zkConnectionState = ConnectionState.TERMINATED;

  private final ActiveStandbyElectorCallback appClient;
  private final String zkHostPort;
  private final int zkSessionTimeout;
  private final List<ACL> zkAcl;
  private final List<ZKAuthInfo> zkAuthInfo;
  // Private copy of the data passed to joinElection(); also written to the
  // bread-crumb znode when this node becomes active.
  private byte[] appData;
  private final String zkLockFilePath;
  private final String zkBreadCrumbPath;
  private final String znodeWorkingDir;
  // Upper bound on create retries, stat retries and connection attempts.
  private final int maxRetryNum;

  // Held for the duration of reJoinElection(); tests can acquire it to block
  // session re-establishment.
  private Lock sessionReestablishLockForTests = new ReentrantLock();
  // True once joinElectionInternal() has run; cleared by terminateConnection()
  // and quitElection().
  private boolean wantToBeInElection;
  // Cleared when a stat callback is processed.
  // NOTE(review): presumably set when an asynchronous stat is issued; the
  // code doing so is outside this part of the file.
  private boolean monitorLockNodePending = false;
  // NOTE(review): not used in the visible part of the file; likely the client
  // on which the pending stat was issued. Confirm against monitorLockNodeAsync.
  private ZooKeeper monitorLockNodeClient;
178
179  /**
180   * Create a new ActiveStandbyElector object <br/>
181   * The elector is created by providing to it the Zookeeper configuration, the
182   * parent znode under which to create the znode and a reference to the
183   * callback interface. <br/>
184   * The parent znode name must be the same for all service instances and
185   * different across services. <br/>
186   * After the leader has been lost, a new leader will be elected after the
187   * session timeout expires. Hence, the app must set this parameter based on
188   * its needs for failure response time. The session timeout must be greater
189   * than the Zookeeper disconnect timeout and is recommended to be 3X that
190   * value to enable Zookeeper to retry transient disconnections. Setting a very
191   * short session timeout may result in frequent transitions between active and
192   * standby states during issues like network outages/GS pauses.
193   * 
194   * @param zookeeperHostPorts
195   *          ZooKeeper hostPort for all ZooKeeper servers
196   * @param zookeeperSessionTimeout
197   *          ZooKeeper session timeout
198   * @param parentZnodeName
199   *          znode under which to create the lock
200   * @param acl
201   *          ZooKeeper ACL's
202   * @param authInfo a list of authentication credentials to add to the
203   *                 ZK connection
204   * @param app
205   *          reference to callback interface object
206   * @throws IOException
207   * @throws HadoopIllegalArgumentException
208   */
209  public ActiveStandbyElector(String zookeeperHostPorts,
210      int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
211      List<ZKAuthInfo> authInfo,
212      ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
213      HadoopIllegalArgumentException, KeeperException {
214    if (app == null || acl == null || parentZnodeName == null
215        || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
216      throw new HadoopIllegalArgumentException("Invalid argument");
217    }
218    zkHostPort = zookeeperHostPorts;
219    zkSessionTimeout = zookeeperSessionTimeout;
220    zkAcl = acl;
221    zkAuthInfo = authInfo;
222    appClient = app;
223    znodeWorkingDir = parentZnodeName;
224    zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
225    zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
226    this.maxRetryNum = maxRetryNum;
227
228    // createConnection for future API calls
229    createConnection();
230  }
231
232  /**
233   * To participate in election, the app will call joinElection. The result will
234   * be notified by a callback on either the becomeActive or becomeStandby app
235   * interfaces. <br/>
236   * After this the elector will automatically monitor the leader status and
237   * perform re-election if necessary<br/>
238   * The app could potentially start off in standby mode and ignore the
239   * becomeStandby call.
240   * 
241   * @param data
242   *          to be set by the app. non-null data must be set.
243   * @throws HadoopIllegalArgumentException
244   *           if valid data is not supplied
245   */
246  public synchronized void joinElection(byte[] data)
247      throws HadoopIllegalArgumentException {
248    
249    if (data == null) {
250      throw new HadoopIllegalArgumentException("data cannot be null");
251    }
252    
253    if (wantToBeInElection) {
254      LOG.info("Already in election. Not re-connecting.");
255      return;
256    }
257
258    appData = new byte[data.length];
259    System.arraycopy(data, 0, appData, 0, data.length);
260
261    LOG.debug("Attempting active election for " + this);
262    joinElectionInternal();
263  }
264  
265  /**
266   * @return true if the configured parent znode exists
267   */
268  public synchronized boolean parentZNodeExists()
269      throws IOException, InterruptedException {
270    Preconditions.checkState(zkClient != null);
271    try {
272      return zkClient.exists(znodeWorkingDir, false) != null;
273    } catch (KeeperException e) {
274      throw new IOException("Couldn't determine existence of znode '" +
275          znodeWorkingDir + "'", e);
276    }
277  }
278
279  /**
280   * Utility function to ensure that the configured base znode exists.
281   * This recursively creates the znode as well as all of its parents.
282   */
283  public synchronized void ensureParentZNode()
284      throws IOException, InterruptedException {
285    Preconditions.checkState(!wantToBeInElection,
286        "ensureParentZNode() may not be called while in the election");
287
288    String pathParts[] = znodeWorkingDir.split("/");
289    Preconditions.checkArgument(pathParts.length >= 1 &&
290        pathParts[0].isEmpty(),
291        "Invalid path: %s", znodeWorkingDir);
292    
293    StringBuilder sb = new StringBuilder();
294    for (int i = 1; i < pathParts.length; i++) {
295      sb.append("/").append(pathParts[i]);
296      String prefixPath = sb.toString();
297      LOG.debug("Ensuring existence of " + prefixPath);
298      try {
299        createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
300      } catch (KeeperException e) {
301        if (isNodeExists(e.code())) {
302          // This is OK - just ensuring existence.
303          continue;
304        } else {
305          throw new IOException("Couldn't create " + prefixPath, e);
306        }
307      }
308    }
309    
310    LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
311  }
312  
313  /**
314   * Clear all of the state held within the parent ZNode.
315   * This recursively deletes everything within the znode as well as the
316   * parent znode itself. It should only be used when it's certain that
317   * no electors are currently participating in the election.
318   */
319  public synchronized void clearParentZNode()
320      throws IOException, InterruptedException {
321    Preconditions.checkState(!wantToBeInElection,
322        "clearParentZNode() may not be called while in the election");
323
324    try {
325      LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
326
327      zkDoWithRetries(new ZKAction<Void>() {
328        @Override
329        public Void run() throws KeeperException, InterruptedException {
330          ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
331          return null;
332        }
333      });
334    } catch (KeeperException e) {
335      throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
336          e);
337    }
338    LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
339  }
340
341
342  /**
343   * Any service instance can drop out of the election by calling quitElection. 
344   * <br/>
345   * This will lose any leader status, if held, and stop monitoring of the lock
346   * node. <br/>
347   * If the instance wants to participate in election again, then it needs to
348   * call joinElection(). <br/>
349   * This allows service instances to take themselves out of rotation for known
350   * impending unavailable states (e.g. long GC pause or software upgrade).
351   * 
352   * @param needFence true if the underlying daemon may need to be fenced
353   * if a failover occurs due to dropping out of the election.
354   */
355  public synchronized void quitElection(boolean needFence) {
356    LOG.info("Yielding from election");
357    if (!needFence && state == State.ACTIVE) {
358      // If active is gracefully going back to standby mode, remove
359      // our permanent znode so no one fences us.
360      tryDeleteOwnBreadCrumbNode();
361    }
362    reset();
363    wantToBeInElection = false;
364  }
365
  /**
   * Exception thrown when there is no active leader, i.e. the lock znode
   * could not be found when its data was requested.
   */
  public static class ActiveNotFoundException extends Exception {
    private static final long serialVersionUID = 3505396722342846462L;
  }
372
373  /**
374   * get data set by the active leader
375   * 
376   * @return data set by the active instance
377   * @throws ActiveNotFoundException
378   *           when there is no active leader
379   * @throws KeeperException
380   *           other zookeeper operation errors
381   * @throws InterruptedException
382   * @throws IOException
383   *           when ZooKeeper connection could not be established
384   */
385  public synchronized byte[] getActiveData() throws ActiveNotFoundException,
386      KeeperException, InterruptedException, IOException {
387    try {
388      if (zkClient == null) {
389        createConnection();
390      }
391      Stat stat = new Stat();
392      return getDataWithRetries(zkLockFilePath, false, stat);
393    } catch(KeeperException e) {
394      Code code = e.code();
395      if (isNodeDoesNotExist(code)) {
396        // handle the commonly expected cases that make sense for us
397        throw new ActiveNotFoundException();
398      } else {
399        throw e;
400      }
401    }
402  }
403
  /**
   * Interface implementation of the Zookeeper callback for create, i.e. the
   * result of trying to create the lock znode.
   * <ul>
   * <li>Success: try to become active and monitor the lock; if becoming
   * active fails, re-join the election after a delay.</li>
   * <li>Node exists: become standby unless the create was retried, then
   * monitor the lock.</li>
   * <li>Retryable error: retry the create up to maxRetryNum times.</li>
   * <li>Session expired: only log; anything else is a fatal error.</li>
   * </ul>
   * Results for which isStaleClient(ctx) returns true are ignored.
   *
   * @param rc result code of the operation, decoded with {@link Code#get}
   * @param path path the operation was issued for (used for logging)
   * @param ctx context object handed to isStaleClient
   * @param name unused by this implementation
   */
  @Override
  public synchronized void processResult(int rc, String path, Object ctx,
      String name) {
    if (isStaleClient(ctx)) return;
    LOG.debug("CreateNode result: " + rc + " for path: " + path
        + " connectionState: " + zkConnectionState +
        "  for " + this);

    Code code = Code.get(rc);
    if (isSuccess(code)) {
      // we successfully created the znode. we are the leader. start monitoring
      if (becomeActive()) {
        monitorActiveStatus();
      } else {
        reJoinElectionAfterFailureToBecomeActive();
      }
      return;
    }

    if (isNodeExists(code)) {
      if (createRetryCount == 0) {
        // znode exists and we did not retry the operation. so a different
        // instance has created it. become standby and monitor lock.
        becomeStandby();
      }
      // if we had retried then the znode could have been created by our first
      // attempt to the server (that we lost) and this node exists response is
      // for the second attempt. verify this case via ephemeral node owner. this
      // will happen on the callback for monitoring the lock.
      monitorActiveStatus();
      return;
    }

    String errorMessage = "Received create error from Zookeeper. code:"
        + code.toString() + " for path " + path;
    LOG.debug(errorMessage);

    if (shouldRetry(code)) {
      if (createRetryCount < maxRetryNum) {
        LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
        ++createRetryCount;
        createLockNodeAsync();
        return;
      }
      // retries exhausted: fall through to the fatal error below
      errorMessage = errorMessage
          + ". Not retrying further znode create connection errors.";
    } else if (isSessionExpired(code)) {
      // This isn't fatal - the client Watcher will re-join the election
      LOG.warn("Lock acquisition failed because session was lost");
      return;
    }

    fatalError(errorMessage);
  }
461
  /**
   * Interface implementation of the Zookeeper callback for monitor (exists),
   * i.e. the result of a stat on the lock znode.
   * <ul>
   * <li>Success: become active if the znode's ephemeral owner is our
   * session, otherwise become standby.</li>
   * <li>Node does not exist: enter neutral mode and try to take the
   * lock.</li>
   * <li>Retryable error: retry the stat up to maxRetryNum times.</li>
   * <li>Session expired: only log; anything else is a fatal error.</li>
   * </ul>
   * Results for which isStaleClient(ctx) returns true are ignored.
   *
   * @param rc result code of the operation, decoded with {@link Code#get}
   * @param path path the operation was issued for (used for logging)
   * @param ctx context object handed to isStaleClient
   * @param stat stat of the lock znode; read only when the result is success
   */
  @Override
  public synchronized void processResult(int rc, String path, Object ctx,
      Stat stat) {
    if (isStaleClient(ctx)) return;
    // a result for the current client has arrived
    monitorLockNodePending = false;

    assert wantToBeInElection :
        "Got a StatNode result after quitting election";
    
    LOG.debug("StatNode result: " + rc + " for path: " + path
        + " connectionState: " + zkConnectionState + " for " + this);
        

    Code code = Code.get(rc);
    if (isSuccess(code)) {
      // the following owner check completes verification in case the lock znode
      // creation was retried
      if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
        // we own the lock znode. so we are the leader
        if (!becomeActive()) {
          reJoinElectionAfterFailureToBecomeActive();
        }
      } else {
        // we dont own the lock znode. so we are a standby.
        becomeStandby();
      }
      // the watch set by us will notify about changes
      return;
    }

    if (isNodeDoesNotExist(code)) {
      // the lock znode disappeared before we started monitoring it
      enterNeutralMode();
      joinElectionInternal();
      return;
    }

    String errorMessage = "Received stat error from Zookeeper. code:"
        + code.toString();
    LOG.debug(errorMessage);

    if (shouldRetry(code)) {
      if (statRetryCount < maxRetryNum) {
        ++statRetryCount;
        monitorLockNodeAsync();
        return;
      }
      // retries exhausted: fall through to the fatal error below
      errorMessage = errorMessage
          + ". Not retrying further znode monitoring connection errors.";
    } else if (isSessionExpired(code)) {
      // This isn't fatal - the client Watcher will re-join the election
      LOG.warn("Lock monitoring failed because session was lost");
      return;
    }

    fatalError(errorMessage);
  }
522
  /**
   * We failed to become active. Re-join the election, but sleep for
   * SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE milliseconds after terminating our
   * existing session, so that other nodes have a chance to become active.
   * The failure to become active is already logged inside
   * becomeActive().
   */
  private void reJoinElectionAfterFailureToBecomeActive() {
    reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
  }
533
  /**
   * Interface implementation of Zookeeper watch events (connection and node),
   * proxied by {@link WatcherWithClientRef}.
   * <br/>
   * Events of type None describe a change of the connection state and are
   * handled per state (connected, disconnected, expired, SASL
   * authenticated); any other state is a fatal error. Events carrying a path
   * are node events: a deleted node triggers a new attempt to take the lock,
   * everything else triggers a fresh stat of the lock. An event that is
   * neither is a fatal error. Events for which isStaleClient(zk) returns
   * true are ignored.
   *
   * @param zk the client the event was delivered for
   * @param event the watch event to process
   */
  synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
    Event.EventType eventType = event.getType();
    if (isStaleClient(zk)) return;
    LOG.debug("Watcher event type: " + eventType + " with state:"
        + event.getState() + " for path:" + event.getPath()
        + " connectionState: " + zkConnectionState
        + " for " + this);

    if (eventType == Event.EventType.None) {
      // the connection state has changed
      switch (event.getState()) {
      case SyncConnected:
        LOG.info("Session connected.");
        // if the listener was asked to move to safe state then it needs to
        // be undone
        ConnectionState prevConnectionState = zkConnectionState;
        zkConnectionState = ConnectionState.CONNECTED;
        // only a reconnect after a disconnect needs the lock re-checked
        if (prevConnectionState == ConnectionState.DISCONNECTED &&
            wantToBeInElection) {
          monitorActiveStatus();
        }
        break;
      case Disconnected:
        LOG.info("Session disconnected. Entering neutral mode...");

        // ask the app to move to safe state because zookeeper connection
        // is not active and we dont know our state
        zkConnectionState = ConnectionState.DISCONNECTED;
        enterNeutralMode();
        break;
      case Expired:
        // the connection got terminated because of session timeout
        // call listener to reconnect
        LOG.info("Session expired. Entering neutral mode and rejoining...");
        enterNeutralMode();
        // re-join right away, without sleeping
        reJoinElection(0);
        break;
      case SaslAuthenticated:
        LOG.info("Successfully authenticated to ZooKeeper using SASL.");
        break;
      default:
        fatalError("Unexpected Zookeeper watch event state: "
            + event.getState());
        break;
      }

      return;
    }

    // a watch on lock path in zookeeper has fired. so something has changed on
    // the lock. ideally we should check that the path is the same as the lock
    // path but trusting zookeeper for now
    String path = event.getPath();
    if (path != null) {
      switch (eventType) {
      case NodeDeleted:
        if (state == State.ACTIVE) {
          enterNeutralMode();
        }
        joinElectionInternal();
        break;
      case NodeDataChanged:
        monitorActiveStatus();
        break;
      default:
        LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
        monitorActiveStatus();
      }

      return;
    }

    // some unexpected error has occurred
    fatalError("Unexpected watch error from Zookeeper");
  }
613
614  /**
615   * Get a new zookeeper client instance. protected so that test class can
616   * inherit and pass in a mock object for zookeeper
617   * 
618   * @return new zookeeper client instance
619   * @throws IOException
620   * @throws KeeperException zookeeper connectionloss exception
621   */
622  protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
623      KeeperException {
624    
625    // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
626    // may trigger the Connected event immediately. So, if we register the
627    // watcher after constructing ZooKeeper, we may miss that event. Instead,
628    // we construct the watcher first, and have it block any events it receives
629    // before we can set its ZooKeeper reference.
630    watcher = new WatcherWithClientRef();
631    ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
632    watcher.setZooKeeperRef(zk);
633
634    // Wait for the asynchronous success/failure. This may throw an exception
635    // if we don't connect within the session timeout.
636    watcher.waitForZKConnectionEvent(zkSessionTimeout);
637    
638    for (ZKAuthInfo auth : zkAuthInfo) {
639      zk.addAuthInfo(auth.getScheme(), auth.getAuth());
640    }
641    return zk;
642  }
643
  /**
   * Handle an unrecoverable error: log it at fatal level, reset the elector
   * (state back to INIT, connection terminated) and notify the application.
   *
   * @param errorMessage description of the error, passed on to the app
   */
  private void fatalError(String errorMessage) {
    LOG.fatal(errorMessage);
    reset();
    appClient.notifyFatalError(errorMessage);
  }
649
  /**
   * Start a fresh round of monitoring the lock znode: resets the stat retry
   * counter and issues an asynchronous stat. Must only be called while in
   * the election.
   */
  private void monitorActiveStatus() {
    assert wantToBeInElection;
    LOG.debug("Monitoring active leader for " + this);
    statRetryCount = 0;
    monitorLockNodeAsync();
  }
656
657  private void joinElectionInternal() {
658    Preconditions.checkState(appData != null,
659        "trying to join election without any app data");
660    if (zkClient == null) {
661      if (!reEstablishSession()) {
662        fatalError("Failed to reEstablish connection with ZooKeeper");
663        return;
664      }
665    }
666
667    createRetryCount = 0;
668    wantToBeInElection = true;
669    createLockNodeAsync();
670  }
671
672  private void reJoinElection(int sleepTime) {
673    LOG.info("Trying to re-establish ZK session");
674    
675    // Some of the test cases rely on expiring the ZK sessions and
676    // ensuring that the other node takes over. But, there's a race
677    // where the original lease holder could reconnect faster than the other
678    // thread manages to take the lock itself. This lock allows the
679    // tests to block the reconnection. It's a shame that this leaked
680    // into non-test code, but the lock is only acquired here so will never
681    // be contended.
682    sessionReestablishLockForTests.lock();
683    try {
684      terminateConnection();
685      sleepFor(sleepTime);
686      // Should not join election even before the SERVICE is reported
687      // as HEALTHY from ZKFC monitoring.
688      if (appData != null) {
689        joinElectionInternal();
690      } else {
691        LOG.info("Not joining election since service has not yet been " +
692            "reported as healthy.");
693      }
694    } finally {
695      sessionReestablishLockForTests.unlock();
696    }
697  }
698
699  /**
700   * Sleep for the given number of milliseconds.
701   * This is non-static, and separated out, so that unit tests
702   * can override the behavior not to sleep.
703   */
704  @VisibleForTesting
705  protected void sleepFor(int sleepMs) {
706    if (sleepMs > 0) {
707      try {
708        Thread.sleep(sleepMs);
709      } catch (InterruptedException e) {
710        Thread.currentThread().interrupt();
711      }
712    }
713  }
714
  /**
   * Test hook: acquires the lock that reJoinElection() takes, so that
   * re-establishing the session blocks until
   * allowSessionReestablishmentForTests() is called.
   */
  @VisibleForTesting
  void preventSessionReestablishmentForTests() {
    sessionReestablishLockForTests.lock();
  }
719  
  /**
   * Test hook: releases the lock taken by
   * preventSessionReestablishmentForTests().
   */
  @VisibleForTesting
  void allowSessionReestablishmentForTests() {
    sessionReestablishLockForTests.unlock();
  }
724  
725  @VisibleForTesting
726  synchronized long getZKSessionIdForTests() {
727    if (zkClient != null) {
728      return zkClient.getSessionId();
729    } else {
730      return -1;
731    }
732  }
733  
  /**
   * Test hook.
   *
   * @return the current election state of this elector
   */
  @VisibleForTesting
  synchronized State getStateForTests() {
    return state;
  }
738
  /**
   * Test hook.
   *
   * @return the current value of the monitorLockNodePending flag
   */
  @VisibleForTesting
  synchronized boolean isMonitorLockNodePending() {
    return monitorLockNodePending;
  }
743
744  private boolean reEstablishSession() {
745    int connectionRetryCount = 0;
746    boolean success = false;
747    while(!success && connectionRetryCount < maxRetryNum) {
748      LOG.debug("Establishing zookeeper connection for " + this);
749      try {
750        createConnection();
751        success = true;
752      } catch(IOException e) {
753        LOG.warn(e);
754        sleepFor(5000);
755      } catch(KeeperException e) {
756        LOG.warn(e);
757        sleepFor(5000);
758      }
759      ++connectionRetryCount;
760    }
761    return success;
762  }
763
764  private void createConnection() throws IOException, KeeperException {
765    if (zkClient != null) {
766      try {
767        zkClient.close();
768      } catch (InterruptedException e) {
769        throw new IOException("Interrupted while closing ZK",
770            e);
771      }
772      zkClient = null;
773      watcher = null;
774    }
775    zkClient = getNewZooKeeper();
776    LOG.debug("Created new connection for " + this);
777  }
778
779  @InterfaceAudience.Private
780  public synchronized void terminateConnection() {
781    if (zkClient == null) {
782      return;
783    }
784    LOG.debug("Terminating ZK connection for " + this);
785    ZooKeeper tempZk = zkClient;
786    zkClient = null;
787    watcher = null;
788    try {
789      tempZk.close();
790    } catch(InterruptedException e) {
791      LOG.warn(e);
792    }
793    zkConnectionState = ConnectionState.TERMINATED;
794    wantToBeInElection = false;
795  }
796
  /**
   * Return the elector to its initial state: the election state becomes INIT
   * and the ZooKeeper connection is terminated.
   */
  private void reset() {
    state = State.INIT;
    terminateConnection();
  }
801
802  private boolean becomeActive() {
803    assert wantToBeInElection;
804    if (state == State.ACTIVE) {
805      // already active
806      return true;
807    }
808    try {
809      Stat oldBreadcrumbStat = fenceOldActive();
810      writeBreadCrumbNode(oldBreadcrumbStat);
811      
812      LOG.debug("Becoming active for " + this);
813      appClient.becomeActive();
814      state = State.ACTIVE;
815      return true;
816    } catch (Exception e) {
817      LOG.warn("Exception handling the winning of election", e);
818      // Caller will handle quitting and rejoining the election.
819      return false;
820    }
821  }
822
823  /**
824   * Write the "ActiveBreadCrumb" node, indicating that this node may need
825   * to be fenced on failover.
826   * @param oldBreadcrumbStat 
827   */
828  private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
829      throws KeeperException, InterruptedException {
830    Preconditions.checkState(appData != null, "no appdata");
831    
832    LOG.info("Writing znode " + zkBreadCrumbPath +
833        " to indicate that the local node is the most recent active...");
834    if (oldBreadcrumbStat == null) {
835      // No previous active, just create the node
836      createWithRetries(zkBreadCrumbPath, appData, zkAcl,
837        CreateMode.PERSISTENT);
838    } else {
839      // There was a previous active, update the node
840      setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
841    }
842  }
843  
844  /**
845   * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
846   * active status.
847   * If this fails, it will simply warn, since the graceful release behavior
848   * is only an optimization.
849   */
850  private void tryDeleteOwnBreadCrumbNode() {
851    assert state == State.ACTIVE;
852    LOG.info("Deleting bread-crumb of active node...");
853    
854    // Sanity check the data. This shouldn't be strictly necessary,
855    // but better to play it safe.
856    Stat stat = new Stat();
857    byte[] data = null;
858    try {
859      data = zkClient.getData(zkBreadCrumbPath, false, stat);
860
861      if (!Arrays.equals(data, appData)) {
862        throw new IllegalStateException(
863            "We thought we were active, but in fact " +
864            "the active znode had the wrong data: " +
865            StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
866      }
867      
868      deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
869    } catch (Exception e) {
870      LOG.warn("Unable to delete our own bread-crumb of being active at " +
871          zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
872          "Expecting to be fenced by the next active.");
873    }
874  }
875
876  /**
877   * If there is a breadcrumb node indicating that another node may need
878   * fencing, try to fence that node.
879   * @return the Stat of the breadcrumb node that was read, or null
880   * if no breadcrumb node existed
881   */
882  private Stat fenceOldActive() throws InterruptedException, KeeperException {
883    final Stat stat = new Stat();
884    byte[] data;
885    LOG.info("Checking for any old active which needs to be fenced...");
886    try {
887      data = zkDoWithRetries(new ZKAction<byte[]>() {
888        @Override
889        public byte[] run() throws KeeperException, InterruptedException {
890          return zkClient.getData(zkBreadCrumbPath, false, stat);
891        }
892      });
893    } catch (KeeperException ke) {
894      if (isNodeDoesNotExist(ke.code())) {
895        LOG.info("No old node to fence");
896        return null;
897      }
898      
899      // If we failed to read for any other reason, then likely we lost
900      // our session, or we don't have permissions, etc. In any case,
901      // we probably shouldn't become active, and failing the whole
902      // thing is the best bet.
903      throw ke;
904    }
905
906    LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
907    if (Arrays.equals(data, appData)) {
908      LOG.info("But old node has our own data, so don't need to fence it.");
909    } else {
910      appClient.fenceOldActive(data);
911    }
912    return stat;
913  }
914
915  private void becomeStandby() {
916    if (state != State.STANDBY) {
917      LOG.debug("Becoming standby for " + this);
918      state = State.STANDBY;
919      appClient.becomeStandby();
920    }
921  }
922
923  private void enterNeutralMode() {
924    if (state != State.NEUTRAL) {
925      LOG.debug("Entering neutral mode for " + this);
926      state = State.NEUTRAL;
927      appClient.enterNeutralMode();
928    }
929  }
930
931  private void createLockNodeAsync() {
932    zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
933        this, zkClient);
934  }
935
936  private void monitorLockNodeAsync() {
937    if (monitorLockNodePending && monitorLockNodeClient == zkClient) {
938      LOG.info("Ignore duplicate monitor lock-node request.");
939      return;
940    }
941    monitorLockNodePending = true;
942    monitorLockNodeClient = zkClient;
943    zkClient.exists(zkLockFilePath,
944        watcher, this,
945        zkClient);
946  }
947
948  private String createWithRetries(final String path, final byte[] data,
949      final List<ACL> acl, final CreateMode mode)
950      throws InterruptedException, KeeperException {
951    return zkDoWithRetries(new ZKAction<String>() {
952      @Override
953      public String run() throws KeeperException, InterruptedException {
954        return zkClient.create(path, data, acl, mode);
955      }
956    });
957  }
958
959  private byte[] getDataWithRetries(final String path, final boolean watch,
960      final Stat stat) throws InterruptedException, KeeperException {
961    return zkDoWithRetries(new ZKAction<byte[]>() {
962      @Override
963      public byte[] run() throws KeeperException, InterruptedException {
964        return zkClient.getData(path, watch, stat);
965      }
966    });
967  }
968
969  private Stat setDataWithRetries(final String path, final byte[] data,
970      final int version) throws InterruptedException, KeeperException {
971    return zkDoWithRetries(new ZKAction<Stat>() {
972      @Override
973      public Stat run() throws KeeperException, InterruptedException {
974        return zkClient.setData(path, data, version);
975      }
976    });
977  }
978  
979  private void deleteWithRetries(final String path, final int version)
980      throws KeeperException, InterruptedException {
981    zkDoWithRetries(new ZKAction<Void>() {
982      @Override
983      public Void run() throws KeeperException, InterruptedException {
984        zkClient.delete(path, version);
985        return null;
986      }
987    });
988  }
989
990  private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException,
991      InterruptedException {
992    int retry = 0;
993    while (true) {
994      try {
995        return action.run();
996      } catch (KeeperException ke) {
997        if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
998          continue;
999        }
1000        throw ke;
1001      }
1002    }
1003  }
1004
1005  private interface ZKAction<T> {
1006    T run() throws KeeperException, InterruptedException; 
1007  }
1008  
1009  /**
1010   * The callbacks and watchers pass a reference to the ZK client
1011   * which made the original call. We don't want to take action
1012   * based on any callbacks from prior clients after we quit
1013   * the election.
1014   * @param ctx the ZK client passed into the watcher
1015   * @return true if it matches the current client
1016   */
1017  private synchronized boolean isStaleClient(Object ctx) {
1018    Preconditions.checkNotNull(ctx);
1019    if (zkClient != (ZooKeeper)ctx) {
1020      LOG.warn("Ignoring stale result from old client with sessionId " +
1021          String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
1022      return true;
1023    }
1024    return false;
1025  }
1026
1027  /**
1028   * Watcher implementation which keeps a reference around to the
1029   * original ZK connection, and passes it back along with any
1030   * events.
1031   */
1032  private final class WatcherWithClientRef implements Watcher {
1033    private ZooKeeper zk;
1034    
1035    /**
1036     * Latch fired whenever any event arrives. This is used in order
1037     * to wait for the Connected event when the client is first created.
1038     */
1039    private CountDownLatch hasReceivedEvent = new CountDownLatch(1);
1040
1041    /**
1042     * Latch used to wait until the reference to ZooKeeper is set.
1043     */
1044    private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
1045
1046    /**
1047     * Waits for the next event from ZooKeeper to arrive.
1048     * 
1049     * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
1050     * @throws KeeperException if the connection attempt times out. This will
1051     * be a ZooKeeper ConnectionLoss exception code.
1052     * @throws IOException if interrupted while connecting to ZooKeeper
1053     */
1054    private void waitForZKConnectionEvent(int connectionTimeoutMs)
1055        throws KeeperException, IOException {
1056      try {
1057        if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
1058          LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
1059              + connectionTimeoutMs + " milliseconds");
1060          zk.close();
1061          throw KeeperException.create(Code.CONNECTIONLOSS);
1062        }
1063      } catch (InterruptedException e) {
1064        Thread.currentThread().interrupt();
1065        throw new IOException(
1066            "Interrupted when connecting to zookeeper server", e);
1067      }
1068    }
1069
1070    private void setZooKeeperRef(ZooKeeper zk) {
1071      Preconditions.checkState(this.zk == null,
1072          "zk already set -- must be set exactly once");
1073      this.zk = zk;
1074      hasSetZooKeeper.countDown();
1075    }
1076
1077    @Override
1078    public void process(WatchedEvent event) {
1079      hasReceivedEvent.countDown();
1080      try {
1081        if (!hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS)) {
1082          LOG.debug("Event received with stale zk");
1083        }
1084        ActiveStandbyElector.this.processWatchEvent(
1085            zk, event);
1086      } catch (Throwable t) {
1087        fatalError(
1088            "Failed to process watcher event " + event + ": " +
1089            StringUtils.stringifyException(t));
1090      }
1091    }
1092  }
1093
1094  private static boolean isSuccess(Code code) {
1095    return (code == Code.OK);
1096  }
1097
1098  private static boolean isNodeExists(Code code) {
1099    return (code == Code.NODEEXISTS);
1100  }
1101
1102  private static boolean isNodeDoesNotExist(Code code) {
1103    return (code == Code.NONODE);
1104  }
1105  
1106  private static boolean isSessionExpired(Code code) {
1107    return (code == Code.SESSIONEXPIRED);
1108  }
1109
1110  private static boolean shouldRetry(Code code) {
1111    return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT;
1112  }
1113  
1114  @Override
1115  public String toString() {
1116    return "elector id=" + System.identityHashCode(this) +
1117      " appData=" +
1118      ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 
1119      " cb=" + appClient;
1120  }
1121
1122  public String getHAZookeeperConnectionState() {
1123    return this.zkConnectionState.name();
1124  }
1125}