001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.ha; 020 021 import java.io.IOException; 022 import java.util.Arrays; 023 import java.util.List; 024 import java.util.concurrent.CountDownLatch; 025 import java.util.concurrent.TimeUnit; 026 import java.util.concurrent.locks.Lock; 027 import java.util.concurrent.locks.ReentrantLock; 028 029 import org.apache.commons.logging.Log; 030 import org.apache.commons.logging.LogFactory; 031 import org.apache.hadoop.HadoopIllegalArgumentException; 032 import org.apache.hadoop.classification.InterfaceAudience; 033 import org.apache.hadoop.classification.InterfaceStability; 034 import org.apache.hadoop.ha.HAZKUtil.ZKAuthInfo; 035 import org.apache.hadoop.util.StringUtils; 036 import org.apache.zookeeper.data.ACL; 037 import org.apache.zookeeper.KeeperException; 038 import org.apache.zookeeper.Watcher; 039 import org.apache.zookeeper.WatchedEvent; 040 import org.apache.zookeeper.Watcher.Event; 041 import org.apache.zookeeper.ZKUtil; 042 import org.apache.zookeeper.ZooKeeper; 043 import org.apache.zookeeper.CreateMode; 044 import org.apache.zookeeper.AsyncCallback.*; 045 import org.apache.zookeeper.data.Stat; 046 import org.apache.zookeeper.KeeperException.Code; 047 048 import com.google.common.annotations.VisibleForTesting; 049 import com.google.common.base.Preconditions; 050 import com.google.common.collect.Lists; 051 052 /** 053 * 054 * This class implements a simple library to perform leader election on top of 055 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election 056 * can be performed by atomically creating an ephemeral lock file (znode) on 057 * Zookeeper. The service instance that successfully creates the znode becomes 058 * active and the rest become standbys. <br/> 059 * This election mechanism is only efficient for small number of election 060 * candidates (order of 10's) because contention on single znode by a large 061 * number of candidates can result in Zookeeper overload. <br/> 062 * The elector does not guarantee fencing (protection of shared resources) among 063 * service instances. After it has notified an instance about becoming a leader, 064 * then that instance must ensure that it meets the service consistency 065 * requirements. If it cannot do so, then it is recommended to quit the 066 * election. The application implements the {@link ActiveStandbyElectorCallback} 067 * to interact with the elector 068 */ 069 @InterfaceAudience.Private 070 @InterfaceStability.Evolving 071 public class ActiveStandbyElector implements StatCallback, StringCallback { 072 073 /** 074 * Callback interface to interact with the ActiveStandbyElector object. <br/> 075 * The application will be notified with a callback only on state changes 076 * (i.e. there will never be successive calls to becomeActive without an 077 * intermediate call to enterNeutralMode). <br/> 078 * The callbacks will be running on Zookeeper client library threads. The 079 * application should return from these callbacks quickly so as not to impede 080 * Zookeeper client library performance and notifications. The app will 081 * typically remember the state change and return from the callback. It will 082 * then proceed with implementing actions around that state change. It is 083 * possible to be called back again while these actions are in flight and the 084 * app should handle this scenario. 085 */ 086 public interface ActiveStandbyElectorCallback { 087 /** 088 * This method is called when the app becomes the active leader. 089 * If the service fails to become active, it should throw 090 * ServiceFailedException. This will cause the elector to 091 * sleep for a short period, then re-join the election. 092 * 093 * Callback implementations are expected to manage their own 094 * timeouts (e.g. when making an RPC to a remote node). 095 */ 096 void becomeActive() throws ServiceFailedException; 097 098 /** 099 * This method is called when the app becomes a standby 100 */ 101 void becomeStandby(); 102 103 /** 104 * If the elector gets disconnected from Zookeeper and does not know about 105 * the lock state, then it will notify the service via the enterNeutralMode 106 * interface. The service may choose to ignore this or stop doing state 107 * changing operations. Upon reconnection, the elector verifies the leader 108 * status and calls back on the becomeActive and becomeStandby app 109 * interfaces. <br/> 110 * Zookeeper disconnects can happen due to network issues or loss of 111 * Zookeeper quorum. Thus enterNeutralMode can be used to guard against 112 * split-brain issues. In such situations it might be prudent to call 113 * becomeStandby too. However, such state change operations might be 114 * expensive and enterNeutralMode can help guard against doing that for 115 * transient issues. 116 */ 117 void enterNeutralMode(); 118 119 /** 120 * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper 121 * errors or Zookeeper persistent unavailability) then notifyFatalError is 122 * called to notify the app about it. 123 */ 124 void notifyFatalError(String errorMessage); 125 126 /** 127 * If an old active has failed, rather than exited gracefully, then 128 * the new active may need to take some fencing actions against it 129 * before proceeding with failover. 130 * 131 * @param oldActiveData the application data provided by the prior active 132 */ 133 void fenceOldActive(byte[] oldActiveData); 134 } 135 136 /** 137 * Name of the lock znode used by the library. Protected for access in test 138 * classes 139 */ 140 @VisibleForTesting 141 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock"; 142 @VisibleForTesting 143 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb"; 144 145 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class); 146 147 static int NUM_RETRIES = 3; 148 private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000; 149 150 private static enum ConnectionState { 151 DISCONNECTED, CONNECTED, TERMINATED 152 }; 153 154 static enum State { 155 INIT, ACTIVE, STANDBY, NEUTRAL 156 }; 157 158 private State state = State.INIT; 159 private int createRetryCount = 0; 160 private int statRetryCount = 0; 161 private ZooKeeper zkClient; 162 private WatcherWithClientRef watcher; 163 private ConnectionState zkConnectionState = ConnectionState.TERMINATED; 164 165 private final ActiveStandbyElectorCallback appClient; 166 private final String zkHostPort; 167 private final int zkSessionTimeout; 168 private final List<ACL> zkAcl; 169 private final List<ZKAuthInfo> zkAuthInfo; 170 private byte[] appData; 171 private final String zkLockFilePath; 172 private final String zkBreadCrumbPath; 173 private final String znodeWorkingDir; 174 175 private Lock sessionReestablishLockForTests = new ReentrantLock(); 176 private boolean wantToBeInElection; 177 178 /** 179 * Create a new ActiveStandbyElector object <br/> 180 * The elector is created by providing to it the Zookeeper configuration, the 181 * parent znode under which to create the znode and a reference to the 182 * callback interface. <br/> 183 * The parent znode name must be the same for all service instances and 184 * different across services. <br/> 185 * After the leader has been lost, a new leader will be elected after the 186 * session timeout expires. Hence, the app must set this parameter based on 187 * its needs for failure response time. The session timeout must be greater 188 * than the Zookeeper disconnect timeout and is recommended to be 3X that 189 * value to enable Zookeeper to retry transient disconnections. Setting a very 190 * short session timeout may result in frequent transitions between active and 191 * standby states during issues like network outages/GS pauses. 192 * 193 * @param zookeeperHostPorts 194 * ZooKeeper hostPort for all ZooKeeper servers 195 * @param zookeeperSessionTimeout 196 * ZooKeeper session timeout 197 * @param parentZnodeName 198 * znode under which to create the lock 199 * @param acl 200 * ZooKeeper ACL's 201 * @param authInfo a list of authentication credentials to add to the 202 * ZK connection 203 * @param app 204 * reference to callback interface object 205 * @throws IOException 206 * @throws HadoopIllegalArgumentException 207 */ 208 public ActiveStandbyElector(String zookeeperHostPorts, 209 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl, 210 List<ZKAuthInfo> authInfo, 211 ActiveStandbyElectorCallback app) throws IOException, 212 HadoopIllegalArgumentException, KeeperException { 213 if (app == null || acl == null || parentZnodeName == null 214 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { 215 throw new HadoopIllegalArgumentException("Invalid argument"); 216 } 217 zkHostPort = zookeeperHostPorts; 218 zkSessionTimeout = zookeeperSessionTimeout; 219 zkAcl = acl; 220 zkAuthInfo = authInfo; 221 appClient = app; 222 znodeWorkingDir = parentZnodeName; 223 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME; 224 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; 225 226 // createConnection for future API calls 227 createConnection(); 228 } 229 230 /** 231 * To participate in election, the app will call joinElection. The result will 232 * be notified by a callback on either the becomeActive or becomeStandby app 233 * interfaces. <br/> 234 * After this the elector will automatically monitor the leader status and 235 * perform re-election if necessary<br/> 236 * The app could potentially start off in standby mode and ignore the 237 * becomeStandby call. 238 * 239 * @param data 240 * to be set by the app. non-null data must be set. 241 * @throws HadoopIllegalArgumentException 242 * if valid data is not supplied 243 */ 244 public synchronized void joinElection(byte[] data) 245 throws HadoopIllegalArgumentException { 246 247 if (data == null) { 248 throw new HadoopIllegalArgumentException("data cannot be null"); 249 } 250 251 if (wantToBeInElection) { 252 LOG.info("Already in election. Not re-connecting."); 253 return; 254 } 255 256 appData = new byte[data.length]; 257 System.arraycopy(data, 0, appData, 0, data.length); 258 259 LOG.debug("Attempting active election for " + this); 260 joinElectionInternal(); 261 } 262 263 /** 264 * @return true if the configured parent znode exists 265 */ 266 public synchronized boolean parentZNodeExists() 267 throws IOException, InterruptedException { 268 Preconditions.checkState(zkClient != null); 269 try { 270 return zkClient.exists(znodeWorkingDir, false) != null; 271 } catch (KeeperException e) { 272 throw new IOException("Couldn't determine existence of znode '" + 273 znodeWorkingDir + "'", e); 274 } 275 } 276 277 /** 278 * Utility function to ensure that the configured base znode exists. 279 * This recursively creates the znode as well as all of its parents. 280 */ 281 public synchronized void ensureParentZNode() 282 throws IOException, InterruptedException { 283 Preconditions.checkState(!wantToBeInElection, 284 "ensureParentZNode() may not be called while in the election"); 285 286 String pathParts[] = znodeWorkingDir.split("/"); 287 Preconditions.checkArgument(pathParts.length >= 1 && 288 "".equals(pathParts[0]), 289 "Invalid path: %s", znodeWorkingDir); 290 291 StringBuilder sb = new StringBuilder(); 292 for (int i = 1; i < pathParts.length; i++) { 293 sb.append("/").append(pathParts[i]); 294 String prefixPath = sb.toString(); 295 LOG.debug("Ensuring existence of " + prefixPath); 296 try { 297 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT); 298 } catch (KeeperException e) { 299 if (isNodeExists(e.code())) { 300 // This is OK - just ensuring existence. 301 continue; 302 } else { 303 throw new IOException("Couldn't create " + prefixPath, e); 304 } 305 } 306 } 307 308 LOG.info("Successfully created " + znodeWorkingDir + " in ZK."); 309 } 310 311 /** 312 * Clear all of the state held within the parent ZNode. 313 * This recursively deletes everything within the znode as well as the 314 * parent znode itself. It should only be used when it's certain that 315 * no electors are currently participating in the election. 316 */ 317 public synchronized void clearParentZNode() 318 throws IOException, InterruptedException { 319 Preconditions.checkState(!wantToBeInElection, 320 "clearParentZNode() may not be called while in the election"); 321 322 try { 323 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK..."); 324 325 zkDoWithRetries(new ZKAction<Void>() { 326 @Override 327 public Void run() throws KeeperException, InterruptedException { 328 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir); 329 return null; 330 } 331 }); 332 } catch (KeeperException e) { 333 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir, 334 e); 335 } 336 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK."); 337 } 338 339 340 /** 341 * Any service instance can drop out of the election by calling quitElection. 342 * <br/> 343 * This will lose any leader status, if held, and stop monitoring of the lock 344 * node. <br/> 345 * If the instance wants to participate in election again, then it needs to 346 * call joinElection(). <br/> 347 * This allows service instances to take themselves out of rotation for known 348 * impending unavailable states (e.g. long GC pause or software upgrade). 349 * 350 * @param needFence true if the underlying daemon may need to be fenced 351 * if a failover occurs due to dropping out of the election. 352 */ 353 public synchronized void quitElection(boolean needFence) { 354 LOG.info("Yielding from election"); 355 if (!needFence && state == State.ACTIVE) { 356 // If active is gracefully going back to standby mode, remove 357 // our permanent znode so no one fences us. 358 tryDeleteOwnBreadCrumbNode(); 359 } 360 reset(); 361 wantToBeInElection = false; 362 } 363 364 /** 365 * Exception thrown when there is no active leader 366 */ 367 public static class ActiveNotFoundException extends Exception { 368 private static final long serialVersionUID = 3505396722342846462L; 369 } 370 371 /** 372 * get data set by the active leader 373 * 374 * @return data set by the active instance 375 * @throws ActiveNotFoundException 376 * when there is no active leader 377 * @throws KeeperException 378 * other zookeeper operation errors 379 * @throws InterruptedException 380 * @throws IOException 381 * when ZooKeeper connection could not be established 382 */ 383 public synchronized byte[] getActiveData() throws ActiveNotFoundException, 384 KeeperException, InterruptedException, IOException { 385 try { 386 if (zkClient == null) { 387 createConnection(); 388 } 389 Stat stat = new Stat(); 390 return getDataWithRetries(zkLockFilePath, false, stat); 391 } catch(KeeperException e) { 392 Code code = e.code(); 393 if (isNodeDoesNotExist(code)) { 394 // handle the commonly expected cases that make sense for us 395 throw new ActiveNotFoundException(); 396 } else { 397 throw e; 398 } 399 } 400 } 401 402 /** 403 * interface implementation of Zookeeper callback for create 404 */ 405 @Override 406 public synchronized void processResult(int rc, String path, Object ctx, 407 String name) { 408 if (isStaleClient(ctx)) return; 409 LOG.debug("CreateNode result: " + rc + " for path: " + path 410 + " connectionState: " + zkConnectionState + 411 " for " + this); 412 413 Code code = Code.get(rc); 414 if (isSuccess(code)) { 415 // we successfully created the znode. we are the leader. start monitoring 416 if (becomeActive()) { 417 monitorActiveStatus(); 418 } else { 419 reJoinElectionAfterFailureToBecomeActive(); 420 } 421 return; 422 } 423 424 if (isNodeExists(code)) { 425 if (createRetryCount == 0) { 426 // znode exists and we did not retry the operation. so a different 427 // instance has created it. become standby and monitor lock. 428 becomeStandby(); 429 } 430 // if we had retried then the znode could have been created by our first 431 // attempt to the server (that we lost) and this node exists response is 432 // for the second attempt. verify this case via ephemeral node owner. this 433 // will happen on the callback for monitoring the lock. 434 monitorActiveStatus(); 435 return; 436 } 437 438 String errorMessage = "Received create error from Zookeeper. code:" 439 + code.toString() + " for path " + path; 440 LOG.debug(errorMessage); 441 442 if (shouldRetry(code)) { 443 if (createRetryCount < NUM_RETRIES) { 444 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); 445 ++createRetryCount; 446 createLockNodeAsync(); 447 return; 448 } 449 errorMessage = errorMessage 450 + ". Not retrying further znode create connection errors."; 451 } else if (isSessionExpired(code)) { 452 // This isn't fatal - the client Watcher will re-join the election 453 LOG.warn("Lock acquisition failed because session was lost"); 454 return; 455 } 456 457 fatalError(errorMessage); 458 } 459 460 /** 461 * interface implementation of Zookeeper callback for monitor (exists) 462 */ 463 @Override 464 public synchronized void processResult(int rc, String path, Object ctx, 465 Stat stat) { 466 if (isStaleClient(ctx)) return; 467 468 assert wantToBeInElection : 469 "Got a StatNode result after quitting election"; 470 471 LOG.debug("StatNode result: " + rc + " for path: " + path 472 + " connectionState: " + zkConnectionState + " for " + this); 473 474 475 Code code = Code.get(rc); 476 if (isSuccess(code)) { 477 // the following owner check completes verification in case the lock znode 478 // creation was retried 479 if (stat.getEphemeralOwner() == zkClient.getSessionId()) { 480 // we own the lock znode. so we are the leader 481 if (!becomeActive()) { 482 reJoinElectionAfterFailureToBecomeActive(); 483 } 484 } else { 485 // we dont own the lock znode. so we are a standby. 486 becomeStandby(); 487 } 488 // the watch set by us will notify about changes 489 return; 490 } 491 492 if (isNodeDoesNotExist(code)) { 493 // the lock znode disappeared before we started monitoring it 494 enterNeutralMode(); 495 joinElectionInternal(); 496 return; 497 } 498 499 String errorMessage = "Received stat error from Zookeeper. code:" 500 + code.toString(); 501 LOG.debug(errorMessage); 502 503 if (shouldRetry(code)) { 504 if (statRetryCount < NUM_RETRIES) { 505 ++statRetryCount; 506 monitorLockNodeAsync(); 507 return; 508 } 509 errorMessage = errorMessage 510 + ". Not retrying further znode monitoring connection errors."; 511 } else if (isSessionExpired(code)) { 512 // This isn't fatal - the client Watcher will re-join the election 513 LOG.warn("Lock monitoring failed because session was lost"); 514 return; 515 } 516 517 fatalError(errorMessage); 518 } 519 520 /** 521 * We failed to become active. Re-join the election, but 522 * sleep for a few seconds after terminating our existing 523 * session, so that other nodes have a chance to become active. 524 * The failure to become active is already logged inside 525 * becomeActive(). 526 */ 527 private void reJoinElectionAfterFailureToBecomeActive() { 528 reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE); 529 } 530 531 /** 532 * interface implementation of Zookeeper watch events (connection and node), 533 * proxied by {@link WatcherWithClientRef}. 534 */ 535 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) { 536 Event.EventType eventType = event.getType(); 537 if (isStaleClient(zk)) return; 538 LOG.debug("Watcher event type: " + eventType + " with state:" 539 + event.getState() + " for path:" + event.getPath() 540 + " connectionState: " + zkConnectionState 541 + " for " + this); 542 543 if (eventType == Event.EventType.None) { 544 // the connection state has changed 545 switch (event.getState()) { 546 case SyncConnected: 547 LOG.info("Session connected."); 548 // if the listener was asked to move to safe state then it needs to 549 // be undone 550 ConnectionState prevConnectionState = zkConnectionState; 551 zkConnectionState = ConnectionState.CONNECTED; 552 if (prevConnectionState == ConnectionState.DISCONNECTED && 553 wantToBeInElection) { 554 monitorActiveStatus(); 555 } 556 break; 557 case Disconnected: 558 LOG.info("Session disconnected. Entering neutral mode..."); 559 560 // ask the app to move to safe state because zookeeper connection 561 // is not active and we dont know our state 562 zkConnectionState = ConnectionState.DISCONNECTED; 563 enterNeutralMode(); 564 break; 565 case Expired: 566 // the connection got terminated because of session timeout 567 // call listener to reconnect 568 LOG.info("Session expired. Entering neutral mode and rejoining..."); 569 enterNeutralMode(); 570 reJoinElection(0); 571 break; 572 default: 573 fatalError("Unexpected Zookeeper watch event state: " 574 + event.getState()); 575 break; 576 } 577 578 return; 579 } 580 581 // a watch on lock path in zookeeper has fired. so something has changed on 582 // the lock. ideally we should check that the path is the same as the lock 583 // path but trusting zookeeper for now 584 String path = event.getPath(); 585 if (path != null) { 586 switch (eventType) { 587 case NodeDeleted: 588 if (state == State.ACTIVE) { 589 enterNeutralMode(); 590 } 591 joinElectionInternal(); 592 break; 593 case NodeDataChanged: 594 monitorActiveStatus(); 595 break; 596 default: 597 LOG.debug("Unexpected node event: " + eventType + " for path: " + path); 598 monitorActiveStatus(); 599 } 600 601 return; 602 } 603 604 // some unexpected error has occurred 605 fatalError("Unexpected watch error from Zookeeper"); 606 } 607 608 /** 609 * Get a new zookeeper client instance. protected so that test class can 610 * inherit and pass in a mock object for zookeeper 611 * 612 * @return new zookeeper client instance 613 * @throws IOException 614 * @throws KeeperException zookeeper connectionloss exception 615 */ 616 protected synchronized ZooKeeper getNewZooKeeper() throws IOException, 617 KeeperException { 618 619 // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and 620 // may trigger the Connected event immediately. So, if we register the 621 // watcher after constructing ZooKeeper, we may miss that event. Instead, 622 // we construct the watcher first, and have it block any events it receives 623 // before we can set its ZooKeeper reference. 624 watcher = new WatcherWithClientRef(); 625 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); 626 watcher.setZooKeeperRef(zk); 627 628 // Wait for the asynchronous success/failure. This may throw an exception 629 // if we don't connect within the session timeout. 630 watcher.waitForZKConnectionEvent(zkSessionTimeout); 631 632 for (ZKAuthInfo auth : zkAuthInfo) { 633 zk.addAuthInfo(auth.getScheme(), auth.getAuth()); 634 } 635 return zk; 636 } 637 638 private void fatalError(String errorMessage) { 639 LOG.fatal(errorMessage); 640 reset(); 641 appClient.notifyFatalError(errorMessage); 642 } 643 644 private void monitorActiveStatus() { 645 assert wantToBeInElection; 646 LOG.debug("Monitoring active leader for " + this); 647 statRetryCount = 0; 648 monitorLockNodeAsync(); 649 } 650 651 private void joinElectionInternal() { 652 Preconditions.checkState(appData != null, 653 "trying to join election without any app data"); 654 if (zkClient == null) { 655 if (!reEstablishSession()) { 656 fatalError("Failed to reEstablish connection with ZooKeeper"); 657 return; 658 } 659 } 660 661 createRetryCount = 0; 662 wantToBeInElection = true; 663 createLockNodeAsync(); 664 } 665 666 private void reJoinElection(int sleepTime) { 667 LOG.info("Trying to re-establish ZK session"); 668 669 // Some of the test cases rely on expiring the ZK sessions and 670 // ensuring that the other node takes over. But, there's a race 671 // where the original lease holder could reconnect faster than the other 672 // thread manages to take the lock itself. This lock allows the 673 // tests to block the reconnection. It's a shame that this leaked 674 // into non-test code, but the lock is only acquired here so will never 675 // be contended. 676 sessionReestablishLockForTests.lock(); 677 try { 678 terminateConnection(); 679 sleepFor(sleepTime); 680 // Should not join election even before the SERVICE is reported 681 // as HEALTHY from ZKFC monitoring. 682 if (appData != null) { 683 joinElectionInternal(); 684 } else { 685 LOG.info("Not joining election since service has not yet been " + 686 "reported as healthy."); 687 } 688 } finally { 689 sessionReestablishLockForTests.unlock(); 690 } 691 } 692 693 /** 694 * Sleep for the given number of milliseconds. 695 * This is non-static, and separated out, so that unit tests 696 * can override the behavior not to sleep. 697 */ 698 @VisibleForTesting 699 protected void sleepFor(int sleepMs) { 700 if (sleepMs > 0) { 701 try { 702 Thread.sleep(sleepMs); 703 } catch (InterruptedException e) { 704 Thread.currentThread().interrupt(); 705 } 706 } 707 } 708 709 @VisibleForTesting 710 void preventSessionReestablishmentForTests() { 711 sessionReestablishLockForTests.lock(); 712 } 713 714 @VisibleForTesting 715 void allowSessionReestablishmentForTests() { 716 sessionReestablishLockForTests.unlock(); 717 } 718 719 @VisibleForTesting 720 synchronized long getZKSessionIdForTests() { 721 if (zkClient != null) { 722 return zkClient.getSessionId(); 723 } else { 724 return -1; 725 } 726 } 727 728 @VisibleForTesting 729 synchronized State getStateForTests() { 730 return state; 731 } 732 733 private boolean reEstablishSession() { 734 int connectionRetryCount = 0; 735 boolean success = false; 736 while(!success && connectionRetryCount < NUM_RETRIES) { 737 LOG.debug("Establishing zookeeper connection for " + this); 738 try { 739 createConnection(); 740 success = true; 741 } catch(IOException e) { 742 LOG.warn(e); 743 sleepFor(5000); 744 } catch(KeeperException e) { 745 LOG.warn(e); 746 sleepFor(5000); 747 } 748 ++connectionRetryCount; 749 } 750 return success; 751 } 752 753 private void createConnection() throws IOException, KeeperException { 754 if (zkClient != null) { 755 try { 756 zkClient.close(); 757 } catch (InterruptedException e) { 758 throw new IOException("Interrupted while closing ZK", 759 e); 760 } 761 zkClient = null; 762 watcher = null; 763 } 764 zkClient = getNewZooKeeper(); 765 LOG.debug("Created new connection for " + this); 766 } 767 768 void terminateConnection() { 769 if (zkClient == null) { 770 return; 771 } 772 LOG.debug("Terminating ZK connection for " + this); 773 ZooKeeper tempZk = zkClient; 774 zkClient = null; 775 watcher = null; 776 try { 777 tempZk.close(); 778 } catch(InterruptedException e) { 779 LOG.warn(e); 780 } 781 zkConnectionState = ConnectionState.TERMINATED; 782 wantToBeInElection = false; 783 } 784 785 private void reset() { 786 state = State.INIT; 787 terminateConnection(); 788 } 789 790 private boolean becomeActive() { 791 assert wantToBeInElection; 792 if (state == State.ACTIVE) { 793 // already active 794 return true; 795 } 796 try { 797 Stat oldBreadcrumbStat = fenceOldActive(); 798 writeBreadCrumbNode(oldBreadcrumbStat); 799 800 LOG.debug("Becoming active for " + this); 801 appClient.becomeActive(); 802 state = State.ACTIVE; 803 return true; 804 } catch (Exception e) { 805 LOG.warn("Exception handling the winning of election", e); 806 // Caller will handle quitting and rejoining the election. 807 return false; 808 } 809 } 810 811 /** 812 * Write the "ActiveBreadCrumb" node, indicating that this node may need 813 * to be fenced on failover. 814 * @param oldBreadcrumbStat 815 */ 816 private void writeBreadCrumbNode(Stat oldBreadcrumbStat) 817 throws KeeperException, InterruptedException { 818 Preconditions.checkState(appData != null, "no appdata"); 819 820 LOG.info("Writing znode " + zkBreadCrumbPath + 821 " to indicate that the local node is the most recent active..."); 822 if (oldBreadcrumbStat == null) { 823 // No previous active, just create the node 824 createWithRetries(zkBreadCrumbPath, appData, zkAcl, 825 CreateMode.PERSISTENT); 826 } else { 827 // There was a previous active, update the node 828 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion()); 829 } 830 } 831 832 /** 833 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up 834 * active status. 835 * If this fails, it will simply warn, since the graceful release behavior 836 * is only an optimization. 837 */ 838 private void tryDeleteOwnBreadCrumbNode() { 839 assert state == State.ACTIVE; 840 LOG.info("Deleting bread-crumb of active node..."); 841 842 // Sanity check the data. This shouldn't be strictly necessary, 843 // but better to play it safe. 844 Stat stat = new Stat(); 845 byte[] data = null; 846 try { 847 data = zkClient.getData(zkBreadCrumbPath, false, stat); 848 849 if (!Arrays.equals(data, appData)) { 850 throw new IllegalStateException( 851 "We thought we were active, but in fact " + 852 "the active znode had the wrong data: " + 853 StringUtils.byteToHexString(data) + " (stat=" + stat + ")"); 854 } 855 856 deleteWithRetries(zkBreadCrumbPath, stat.getVersion()); 857 } catch (Exception e) { 858 LOG.warn("Unable to delete our own bread-crumb of being active at " + 859 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " + 860 "Expecting to be fenced by the next active."); 861 } 862 } 863 864 /** 865 * If there is a breadcrumb node indicating that another node may need 866 * fencing, try to fence that node. 867 * @return the Stat of the breadcrumb node that was read, or null 868 * if no breadcrumb node existed 869 */ 870 private Stat fenceOldActive() throws InterruptedException, KeeperException { 871 final Stat stat = new Stat(); 872 byte[] data; 873 LOG.info("Checking for any old active which needs to be fenced..."); 874 try { 875 data = zkDoWithRetries(new ZKAction<byte[]>() { 876 @Override 877 public byte[] run() throws KeeperException, InterruptedException { 878 return zkClient.getData(zkBreadCrumbPath, false, stat); 879 } 880 }); 881 } catch (KeeperException ke) { 882 if (isNodeDoesNotExist(ke.code())) { 883 LOG.info("No old node to fence"); 884 return null; 885 } 886 887 // If we failed to read for any other reason, then likely we lost 888 // our session, or we don't have permissions, etc. In any case, 889 // we probably shouldn't become active, and failing the whole 890 // thing is the best bet. 891 throw ke; 892 } 893 894 LOG.info("Old node exists: " + StringUtils.byteToHexString(data)); 895 if (Arrays.equals(data, appData)) { 896 LOG.info("But old node has our own data, so don't need to fence it."); 897 } else { 898 appClient.fenceOldActive(data); 899 } 900 return stat; 901 } 902 903 private void becomeStandby() { 904 if (state != State.STANDBY) { 905 LOG.debug("Becoming standby for " + this); 906 state = State.STANDBY; 907 appClient.becomeStandby(); 908 } 909 } 910 911 private void enterNeutralMode() { 912 if (state != State.NEUTRAL) { 913 LOG.debug("Entering neutral mode for " + this); 914 state = State.NEUTRAL; 915 appClient.enterNeutralMode(); 916 } 917 } 918 919 private void createLockNodeAsync() { 920 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, 921 this, zkClient); 922 } 923 924 private void monitorLockNodeAsync() { 925 zkClient.exists(zkLockFilePath, 926 watcher, this, 927 zkClient); 928 } 929 930 private String createWithRetries(final String path, final byte[] data, 931 final List<ACL> acl, final CreateMode mode) 932 throws InterruptedException, KeeperException { 933 return zkDoWithRetries(new ZKAction<String>() { 934 @Override 935 public String run() throws KeeperException, InterruptedException { 936 return zkClient.create(path, data, acl, mode); 937 } 938 }); 939 } 940 941 private byte[] getDataWithRetries(final String path, final boolean watch, 942 final Stat stat) throws InterruptedException, KeeperException { 943 return zkDoWithRetries(new ZKAction<byte[]>() { 944 @Override 945 public byte[] run() throws KeeperException, InterruptedException { 946 return zkClient.getData(path, watch, stat); 947 } 948 }); 949 } 950 951 private Stat setDataWithRetries(final String path, final byte[] data, 952 final int version) throws InterruptedException, KeeperException { 953 return zkDoWithRetries(new ZKAction<Stat>() { 954 @Override 955 public Stat run() throws KeeperException, InterruptedException { 956 return zkClient.setData(path, data, version); 957 } 958 }); 959 } 960 961 private void deleteWithRetries(final String path, final int version) 962 throws KeeperException, InterruptedException { 963 zkDoWithRetries(new ZKAction<Void>() { 964 @Override 965 public Void run() throws KeeperException, InterruptedException { 966 zkClient.delete(path, version); 967 return null; 968 } 969 }); 970 } 971 972 private static <T> T zkDoWithRetries(ZKAction<T> action) 973 throws KeeperException, InterruptedException { 974 int retry = 0; 975 while (true) { 976 try { 977 return action.run(); 978 } catch (KeeperException ke) { 979 if (shouldRetry(ke.code()) && ++retry < NUM_RETRIES) { 980 continue; 981 } 982 throw ke; 983 } 984 } 985 } 986 987 private interface ZKAction<T> { 988 T run() throws KeeperException, InterruptedException; 989 } 990 991 /** 992 * The callbacks and watchers pass a reference to the ZK client 993 * which made the original call. We don't want to take action 994 * based on any callbacks from prior clients after we quit 995 * the election. 996 * @param ctx the ZK client passed into the watcher 997 * @return true if it matches the current client 998 */ 999 private synchronized boolean isStaleClient(Object ctx) { 1000 Preconditions.checkNotNull(ctx); 1001 if (zkClient != (ZooKeeper)ctx) { 1002 LOG.warn("Ignoring stale result from old client with sessionId " + 1003 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId())); 1004 return true; 1005 } 1006 return false; 1007 } 1008 1009 /** 1010 * Watcher implementation which keeps a reference around to the 1011 * original ZK connection, and passes it back along with any 1012 * events. 1013 */ 1014 private final class WatcherWithClientRef implements Watcher { 1015 private ZooKeeper zk; 1016 1017 /** 1018 * Latch fired whenever any event arrives. This is used in order 1019 * to wait for the Connected event when the client is first created. 1020 */ 1021 private CountDownLatch hasReceivedEvent = new CountDownLatch(1); 1022 1023 /** 1024 * Latch used to wait until the reference to ZooKeeper is set. 1025 */ 1026 private CountDownLatch hasSetZooKeeper = new CountDownLatch(1); 1027 1028 /** 1029 * Waits for the next event from ZooKeeper to arrive. 1030 * 1031 * @param connectionTimeoutMs zookeeper connection timeout in milliseconds 1032 * @throws KeeperException if the connection attempt times out. This will 1033 * be a ZooKeeper ConnectionLoss exception code. 1034 * @throws IOException if interrupted while connecting to ZooKeeper 1035 */ 1036 private void waitForZKConnectionEvent(int connectionTimeoutMs) 1037 throws KeeperException, IOException { 1038 try { 1039 if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) { 1040 LOG.error("Connection timed out: couldn't connect to ZooKeeper in " 1041 + connectionTimeoutMs + " milliseconds"); 1042 zk.close(); 1043 throw KeeperException.create(Code.CONNECTIONLOSS); 1044 } 1045 } catch (InterruptedException e) { 1046 Thread.currentThread().interrupt(); 1047 throw new IOException( 1048 "Interrupted when connecting to zookeeper server", e); 1049 } 1050 } 1051 1052 private void setZooKeeperRef(ZooKeeper zk) { 1053 Preconditions.checkState(this.zk == null, 1054 "zk already set -- must be set exactly once"); 1055 this.zk = zk; 1056 hasSetZooKeeper.countDown(); 1057 } 1058 1059 @Override 1060 public void process(WatchedEvent event) { 1061 hasReceivedEvent.countDown(); 1062 try { 1063 hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS); 1064 ActiveStandbyElector.this.processWatchEvent( 1065 zk, event); 1066 } catch (Throwable t) { 1067 fatalError( 1068 "Failed to process watcher event " + event + ": " + 1069 StringUtils.stringifyException(t)); 1070 } 1071 } 1072 } 1073 1074 private static boolean isSuccess(Code code) { 1075 return (code == Code.OK); 1076 } 1077 1078 private static boolean isNodeExists(Code code) { 1079 return (code == Code.NODEEXISTS); 1080 } 1081 1082 private static boolean isNodeDoesNotExist(Code code) { 1083 return (code == Code.NONODE); 1084 } 1085 1086 private static boolean isSessionExpired(Code code) { 1087 return (code == Code.SESSIONEXPIRED); 1088 } 1089 1090 private static boolean shouldRetry(Code code) { 1091 switch (code) { 1092 case CONNECTIONLOSS: 1093 case OPERATIONTIMEOUT: 1094 return true; 1095 } 1096 return false; 1097 } 1098 1099 @Override 1100 public String toString() { 1101 return "elector id=" + System.identityHashCode(this) + 1102 " appData=" + 1103 ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 1104 " cb=" + appClient; 1105 } 1106 }