/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.ha;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.data.ACL;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher.Event;
import org.apache.zookeeper.ZKUtil;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.AsyncCallback.*;
import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.KeeperException.Code;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

/**
 *
 * This class implements a simple library to perform leader election on top of
 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
 * can be performed by atomically creating an ephemeral lock file (znode) on
 * Zookeeper. The service instance that successfully creates the znode becomes
 * active and the rest become standbys. <br/>
 * This election mechanism is only efficient for a small number of election
 * candidates (on the order of 10s), because contention on a single znode by a
 * large number of candidates can result in Zookeeper overload. <br/>
 * The elector does not guarantee fencing (protection of shared resources) among
 * service instances. After it has notified an instance about becoming a leader,
 * that instance must ensure that it meets the service consistency
 * requirements. If it cannot do so, then it is recommended to quit the
 * election. The application implements the {@link ActiveStandbyElectorCallback}
 * to interact with the elector.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ActiveStandbyElector implements StatCallback, StringCallback {

  /**
   * Callback interface to interact with the ActiveStandbyElector object. <br/>
   * The application will be notified with a callback only on state changes
   * (i.e. there will never be successive calls to becomeActive without an
   * intermediate call to enterNeutralMode). <br/>
   * The callbacks will be running on Zookeeper client library threads. The
   * application should return from these callbacks quickly so as not to impede
   * Zookeeper client library performance and notifications. The app will
   * typically remember the state change and return from the callback. It will
   * then proceed with implementing actions around that state change. It is
   * possible to be called back again while these actions are in flight and the
   * app should handle this scenario.
   */
  public interface ActiveStandbyElectorCallback {
    /**
     * This method is called when the app becomes the active leader.
     * If the service fails to become active, it should throw
     * ServiceFailedException. This will cause the elector to
     * sleep for a short period, then re-join the election.
     *
     * Callback implementations are expected to manage their own
     * timeouts (e.g. when making an RPC to a remote node).
     */
    void becomeActive() throws ServiceFailedException;

    /**
     * This method is called when the app becomes a standby.
     */
    void becomeStandby();

    /**
     * If the elector gets disconnected from Zookeeper and does not know about
     * the lock state, then it will notify the service via the enterNeutralMode
     * interface. The service may choose to ignore this or stop doing state
     * changing operations. Upon reconnection, the elector verifies the leader
     * status and calls back on the becomeActive and becomeStandby app
     * interfaces. <br/>
     * Zookeeper disconnects can happen due to network issues or loss of
     * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
     * split-brain issues. In such situations it might be prudent to call
     * becomeStandby too. However, such state change operations might be
     * expensive and enterNeutralMode can help guard against doing that for
     * transient issues.
     */
    void enterNeutralMode();

    /**
     * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
     * errors or Zookeeper persistent unavailability) then notifyFatalError is
     * called to notify the app about it.
     */
    void notifyFatalError(String errorMessage);

    /**
     * If an old active has failed, rather than exited gracefully, then
     * the new active may need to take some fencing actions against it
     * before proceeding with failover.
     *
     * @param oldActiveData the application data provided by the prior active
     */
    void fenceOldActive(byte[] oldActiveData);
  }

  /**
   * Name of the lock znode used by the library. Protected for access in test
   * classes.
   */
  @VisibleForTesting
  protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
  @VisibleForTesting
  protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";

  public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);

  private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;

  private static enum ConnectionState {
    DISCONNECTED, CONNECTED, TERMINATED
  };

  static enum State {
    INIT, ACTIVE, STANDBY, NEUTRAL
  };

  private State state = State.INIT;
  private int createRetryCount = 0;
  private int statRetryCount = 0;
  private ZooKeeper zkClient;
  private WatcherWithClientRef watcher;
  private ConnectionState zkConnectionState = ConnectionState.TERMINATED;

  private final ActiveStandbyElectorCallback appClient;
  private final String zkHostPort;
  private final int zkSessionTimeout;
  private final List<ACL> zkAcl;
  private final List<ZKAuthInfo> zkAuthInfo;
  private byte[] appData;
  private final String zkLockFilePath;
  private final String zkBreadCrumbPath;
  private final String znodeWorkingDir;
  private final int maxRetryNum;

  private Lock sessionReestablishLockForTests = new ReentrantLock();
  private boolean wantToBeInElection;
  private boolean monitorLockNodePending = false;
  private ZooKeeper monitorLockNodeClient;

  /**
   * Create a new ActiveStandbyElector object <br/>
   * The elector is created by providing to it the Zookeeper configuration, the
   * parent znode under which to create the znode and a reference to the
   * callback interface. <br/>
   * The parent znode name must be the same for all service instances and
   * different across services. <br/>
   * After the leader has been lost, a new leader will be elected after the
   * session timeout expires. Hence, the app must set this parameter based on
   * its needs for failure response time. The session timeout must be greater
   * than the Zookeeper disconnect timeout and is recommended to be 3X that
   * value to enable Zookeeper to retry transient disconnections. Setting a very
   * short session timeout may result in frequent transitions between active and
   * standby states during issues like network outages or GC pauses.
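   * <p>
   * A minimal, illustrative sketch of how a service might wire up the elector;
   * the host:port string, session timeout, znode path, data and retry count
   * below are hypothetical, and error handling is omitted:
   * <pre>{@code
   * ActiveStandbyElectorCallback cb = new ActiveStandbyElectorCallback() {
   *   public void becomeActive() throws ServiceFailedException {
   *     // start serving as the active instance
   *   }
   *   public void becomeStandby() {
   *     // fall back to standby duties
   *   }
   *   public void enterNeutralMode() {
   *     // ZK connection lost; avoid state-changing operations
   *   }
   *   public void notifyFatalError(String errorMessage) {
   *     // log and shut down, or alert an operator
   *   }
   *   public void fenceOldActive(byte[] oldActiveData) {
   *     // fence the previous active before taking over
   *   }
   * };
   * ActiveStandbyElector elector = new ActiveStandbyElector(
   *     "zk1:2181,zk2:2181,zk3:2181", 5000, "/my-service-ha",
   *     ZooDefs.Ids.OPEN_ACL_UNSAFE, Collections.<ZKAuthInfo> emptyList(),
   *     cb, 3);
   * elector.joinElection("instance-1".getBytes());
   * }</pre>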
   *
   * @param zookeeperHostPorts
   *          ZooKeeper hostPort for all ZooKeeper servers
   * @param zookeeperSessionTimeout
   *          ZooKeeper session timeout
   * @param parentZnodeName
   *          znode under which to create the lock
   * @param acl
   *          ZooKeeper ACL's
   * @param authInfo a list of authentication credentials to add to the
   *                 ZK connection
   * @param app
   *          reference to callback interface object
   * @throws IOException
   * @throws HadoopIllegalArgumentException
   */
  public ActiveStandbyElector(String zookeeperHostPorts,
      int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
      List<ZKAuthInfo> authInfo,
      ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
      HadoopIllegalArgumentException, KeeperException {
    if (app == null || acl == null || parentZnodeName == null
        || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
      throw new HadoopIllegalArgumentException("Invalid argument");
    }
    zkHostPort = zookeeperHostPorts;
    zkSessionTimeout = zookeeperSessionTimeout;
    zkAcl = acl;
    zkAuthInfo = authInfo;
    appClient = app;
    znodeWorkingDir = parentZnodeName;
    zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
    zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
    this.maxRetryNum = maxRetryNum;

    // createConnection for future API calls
    createConnection();
  }

  /**
   * To participate in election, the app will call joinElection. The result will
   * be notified by a callback on either the becomeActive or becomeStandby app
   * interfaces. <br/>
   * After this the elector will automatically monitor the leader status and
   * perform re-election if necessary<br/>
   * The app could potentially start off in standby mode and ignore the
   * becomeStandby call.
   *
   * @param data
   *          to be set by the app. non-null data must be set.
   * @throws HadoopIllegalArgumentException
   *           if valid data is not supplied
   */
  public synchronized void joinElection(byte[] data)
      throws HadoopIllegalArgumentException {

    if (data == null) {
      throw new HadoopIllegalArgumentException("data cannot be null");
    }

    if (wantToBeInElection) {
      LOG.info("Already in election. Not re-connecting.");
      return;
    }

    appData = new byte[data.length];
    System.arraycopy(data, 0, appData, 0, data.length);

    LOG.debug("Attempting active election for " + this);
    joinElectionInternal();
  }

  /**
   * @return true if the configured parent znode exists
   */
  public synchronized boolean parentZNodeExists()
      throws IOException, InterruptedException {
    Preconditions.checkState(zkClient != null);
    try {
      return zkClient.exists(znodeWorkingDir, false) != null;
    } catch (KeeperException e) {
      throw new IOException("Couldn't determine existence of znode '" +
          znodeWorkingDir + "'", e);
    }
  }

  /**
   * Utility function to ensure that the configured base znode exists.
   * This recursively creates the znode as well as all of its parents.
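   * <p>
   * An illustrative call sequence (e.g. when initially setting up the ZK state
   * for a service; not the only valid usage) might check for the parent first
   * and create it only if missing:
   * <pre>{@code
   * if (!elector.parentZNodeExists()) {
   *   elector.ensureParentZNode();
   * }
   * }</pre>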
   */
  public synchronized void ensureParentZNode()
      throws IOException, InterruptedException {
    Preconditions.checkState(!wantToBeInElection,
        "ensureParentZNode() may not be called while in the election");

    String pathParts[] = znodeWorkingDir.split("/");
    Preconditions.checkArgument(pathParts.length >= 1 &&
        pathParts[0].isEmpty(),
        "Invalid path: %s", znodeWorkingDir);

    StringBuilder sb = new StringBuilder();
    for (int i = 1; i < pathParts.length; i++) {
      sb.append("/").append(pathParts[i]);
      String prefixPath = sb.toString();
      LOG.debug("Ensuring existence of " + prefixPath);
      try {
        createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
      } catch (KeeperException e) {
        if (isNodeExists(e.code())) {
          // This is OK - just ensuring existence.
          continue;
        } else {
          throw new IOException("Couldn't create " + prefixPath, e);
        }
      }
    }

    LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
  }

  /**
   * Clear all of the state held within the parent ZNode.
   * This recursively deletes everything within the znode as well as the
   * parent znode itself. It should only be used when it's certain that
   * no electors are currently participating in the election.
   */
  public synchronized void clearParentZNode()
      throws IOException, InterruptedException {
    Preconditions.checkState(!wantToBeInElection,
        "clearParentZNode() may not be called while in the election");

    try {
      LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");

      zkDoWithRetries(new ZKAction<Void>() {
        @Override
        public Void run() throws KeeperException, InterruptedException {
          ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
          return null;
        }
      });
    } catch (KeeperException e) {
      throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
          e);
    }
    LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
  }


  /**
   * Any service instance can drop out of the election by calling quitElection.
   * <br/>
   * This will lose any leader status, if held, and stop monitoring of the lock
   * node. <br/>
   * If the instance wants to participate in election again, then it needs to
   * call joinElection(). <br/>
   * This allows service instances to take themselves out of rotation for known
   * impending unavailable states (e.g. long GC pause or software upgrade).
   *
   * @param needFence true if the underlying daemon may need to be fenced
   * if a failover occurs due to dropping out of the election.
   */
  public synchronized void quitElection(boolean needFence) {
    LOG.info("Yielding from election");
    if (!needFence && state == State.ACTIVE) {
      // If active is gracefully going back to standby mode, remove
      // our permanent znode so no one fences us.
      tryDeleteOwnBreadCrumbNode();
    }
    reset();
    wantToBeInElection = false;
  }

  /**
   * Exception thrown when there is no active leader
   */
  public static class ActiveNotFoundException extends Exception {
    private static final long serialVersionUID = 3505396722342846462L;
  }

  /**
   * get data set by the active leader
   *
   * @return data set by the active instance
   * @throws ActiveNotFoundException
   *           when there is no active leader
   * @throws KeeperException
   *           other zookeeper operation errors
   * @throws InterruptedException
   * @throws IOException
   *           when ZooKeeper connection could not be established
   */
  public synchronized byte[] getActiveData() throws ActiveNotFoundException,
      KeeperException, InterruptedException, IOException {
    try {
      if (zkClient == null) {
        createConnection();
      }
      Stat stat = new Stat();
      return getDataWithRetries(zkLockFilePath, false, stat);
    } catch(KeeperException e) {
      Code code = e.code();
      if (isNodeDoesNotExist(code)) {
        // handle the commonly expected cases that make sense for us
        throw new ActiveNotFoundException();
      } else {
        throw e;
      }
    }
  }

  /**
   * interface implementation of Zookeeper callback for create
   */
  @Override
  public synchronized void processResult(int rc, String path, Object ctx,
      String name) {
    if (isStaleClient(ctx)) return;
    LOG.debug("CreateNode result: " + rc + " for path: " + path
        + " connectionState: " + zkConnectionState +
        " for " + this);

    Code code = Code.get(rc);
    if (isSuccess(code)) {
      // we successfully created the znode. we are the leader. start monitoring
      if (becomeActive()) {
        monitorActiveStatus();
      } else {
        reJoinElectionAfterFailureToBecomeActive();
      }
      return;
    }

    if (isNodeExists(code)) {
      if (createRetryCount == 0) {
        // znode exists and we did not retry the operation. so a different
        // instance has created it. become standby and monitor lock.
        becomeStandby();
      }
      // if we had retried then the znode could have been created by our first
      // attempt to the server (that we lost) and this node exists response is
      // for the second attempt. verify this case via ephemeral node owner. this
      // will happen on the callback for monitoring the lock.
      monitorActiveStatus();
      return;
    }

    String errorMessage = "Received create error from Zookeeper. code:"
        + code.toString() + " for path " + path;
    LOG.debug(errorMessage);

    if (shouldRetry(code)) {
      if (createRetryCount < maxRetryNum) {
        LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
        ++createRetryCount;
        createLockNodeAsync();
        return;
      }
      errorMessage = errorMessage
          + ". Not retrying further znode create connection errors.";
    } else if (isSessionExpired(code)) {
      // This isn't fatal - the client Watcher will re-join the election
      LOG.warn("Lock acquisition failed because session was lost");
      return;
    }

    fatalError(errorMessage);
  }

  /**
   * interface implementation of Zookeeper callback for monitor (exists)
   */
  @Override
  public synchronized void processResult(int rc, String path, Object ctx,
      Stat stat) {
    if (isStaleClient(ctx)) return;
    monitorLockNodePending = false;

    assert wantToBeInElection :
        "Got a StatNode result after quitting election";

    LOG.debug("StatNode result: " + rc + " for path: " + path
        + " connectionState: " + zkConnectionState + " for " + this);

    Code code = Code.get(rc);
    if (isSuccess(code)) {
      // the following owner check completes verification in case the lock znode
      // creation was retried
      if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
        // we own the lock znode. so we are the leader
        if (!becomeActive()) {
          reJoinElectionAfterFailureToBecomeActive();
        }
      } else {
        // we don't own the lock znode. so we are a standby.
        becomeStandby();
      }
      // the watch set by us will notify about changes
      return;
    }

    if (isNodeDoesNotExist(code)) {
      // the lock znode disappeared before we started monitoring it
      enterNeutralMode();
      joinElectionInternal();
      return;
    }

    String errorMessage = "Received stat error from Zookeeper. code:"
        + code.toString();
    LOG.debug(errorMessage);

    if (shouldRetry(code)) {
      if (statRetryCount < maxRetryNum) {
        ++statRetryCount;
        monitorLockNodeAsync();
        return;
      }
      errorMessage = errorMessage
          + ". Not retrying further znode monitoring connection errors.";
    } else if (isSessionExpired(code)) {
      // This isn't fatal - the client Watcher will re-join the election
      LOG.warn("Lock monitoring failed because session was lost");
      return;
    }

    fatalError(errorMessage);
  }

  /**
   * We failed to become active. Re-join the election, but
   * sleep for a few seconds after terminating our existing
   * session, so that other nodes have a chance to become active.
   * The failure to become active is already logged inside
   * becomeActive().
   */
  private void reJoinElectionAfterFailureToBecomeActive() {
    reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
  }

  /**
   * interface implementation of Zookeeper watch events (connection and node),
   * proxied by {@link WatcherWithClientRef}.
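   * <br/>
   * Events with type {@code None} reflect connection state changes
   * (SyncConnected, Disconnected, Expired, SaslAuthenticated), while node
   * events on the lock path (NodeDeleted, NodeDataChanged) trigger
   * re-election or re-monitoring of the lock znode.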
   */
  synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
    Event.EventType eventType = event.getType();
    if (isStaleClient(zk)) return;
    LOG.debug("Watcher event type: " + eventType + " with state:"
        + event.getState() + " for path:" + event.getPath()
        + " connectionState: " + zkConnectionState
        + " for " + this);

    if (eventType == Event.EventType.None) {
      // the connection state has changed
      switch (event.getState()) {
      case SyncConnected:
        LOG.info("Session connected.");
        // if the listener was asked to move to safe state then it needs to
        // be undone
        ConnectionState prevConnectionState = zkConnectionState;
        zkConnectionState = ConnectionState.CONNECTED;
        if (prevConnectionState == ConnectionState.DISCONNECTED &&
            wantToBeInElection) {
          monitorActiveStatus();
        }
        break;
      case Disconnected:
        LOG.info("Session disconnected. Entering neutral mode...");

        // ask the app to move to safe state because zookeeper connection
        // is not active and we don't know our state
        zkConnectionState = ConnectionState.DISCONNECTED;
        enterNeutralMode();
        break;
      case Expired:
        // the connection got terminated because of session timeout
        // call listener to reconnect
        LOG.info("Session expired. Entering neutral mode and rejoining...");
        enterNeutralMode();
        reJoinElection(0);
        break;
      case SaslAuthenticated:
        LOG.info("Successfully authenticated to ZooKeeper using SASL.");
        break;
      default:
        fatalError("Unexpected Zookeeper watch event state: "
            + event.getState());
        break;
      }

      return;
    }

    // a watch on lock path in zookeeper has fired. so something has changed on
    // the lock. ideally we should check that the path is the same as the lock
    // path but trusting zookeeper for now
    String path = event.getPath();
    if (path != null) {
      switch (eventType) {
      case NodeDeleted:
        if (state == State.ACTIVE) {
          enterNeutralMode();
        }
        joinElectionInternal();
        break;
      case NodeDataChanged:
        monitorActiveStatus();
        break;
      default:
        LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
        monitorActiveStatus();
      }

      return;
    }

    // some unexpected error has occurred
    fatalError("Unexpected watch error from Zookeeper");
  }

  /**
   * Get a new zookeeper client instance. Protected so that test classes can
   * inherit and mock out the zookeeper instance
   *
   * @return new zookeeper client instance
   * @throws IOException
   * @throws KeeperException zookeeper connectionloss exception
   */
  protected synchronized ZooKeeper connectToZooKeeper() throws IOException,
      KeeperException {

    // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
    // may trigger the Connected event immediately. So, if we register the
    // watcher after constructing ZooKeeper, we may miss that event. Instead,
    // we construct the watcher first, and have it block any events it receives
    // before we can set its ZooKeeper reference.
    watcher = new WatcherWithClientRef();
    ZooKeeper zk = createZooKeeper();
    watcher.setZooKeeperRef(zk);

    // Wait for the asynchronous success/failure. This may throw an exception
    // if we don't connect within the session timeout.
    watcher.waitForZKConnectionEvent(zkSessionTimeout);

    for (ZKAuthInfo auth : zkAuthInfo) {
      zk.addAuthInfo(auth.getScheme(), auth.getAuth());
    }
    return zk;
  }

  /**
   * Get a new zookeeper client instance. Protected so that test classes can
   * inherit and pass in a mock object for zookeeper
   *
   * @return new zookeeper client instance
   * @throws IOException
   */
  protected ZooKeeper createZooKeeper() throws IOException {
    return new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
  }

  private void fatalError(String errorMessage) {
    LOG.fatal(errorMessage);
    reset();
    appClient.notifyFatalError(errorMessage);
  }

  private void monitorActiveStatus() {
    assert wantToBeInElection;
    LOG.debug("Monitoring active leader for " + this);
    statRetryCount = 0;
    monitorLockNodeAsync();
  }

  private void joinElectionInternal() {
    Preconditions.checkState(appData != null,
        "trying to join election without any app data");
    if (zkClient == null) {
      if (!reEstablishSession()) {
        fatalError("Failed to reEstablish connection with ZooKeeper");
        return;
      }
    }

    createRetryCount = 0;
    wantToBeInElection = true;
    createLockNodeAsync();
  }

  private void reJoinElection(int sleepTime) {
    LOG.info("Trying to re-establish ZK session");

    // Some of the test cases rely on expiring the ZK sessions and
    // ensuring that the other node takes over. But, there's a race
    // where the original lease holder could reconnect faster than the other
    // thread manages to take the lock itself. This lock allows the
    // tests to block the reconnection. It's a shame that this leaked
    // into non-test code, but the lock is only acquired here so will never
    // be contended.
    sessionReestablishLockForTests.lock();
    try {
      terminateConnection();
      sleepFor(sleepTime);
      // Should not join election even before the SERVICE is reported
      // as HEALTHY from ZKFC monitoring.
      if (appData != null) {
        joinElectionInternal();
      } else {
        LOG.info("Not joining election since service has not yet been " +
            "reported as healthy.");
      }
    } finally {
      sessionReestablishLockForTests.unlock();
    }
  }

  /**
   * Sleep for the given number of milliseconds.
   * This is non-static, and separated out, so that unit tests
   * can override the behavior not to sleep.
   */
  @VisibleForTesting
  protected void sleepFor(int sleepMs) {
    if (sleepMs > 0) {
      try {
        Thread.sleep(sleepMs);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
    }
  }

  @VisibleForTesting
  void preventSessionReestablishmentForTests() {
    sessionReestablishLockForTests.lock();
  }

  @VisibleForTesting
  void allowSessionReestablishmentForTests() {
    sessionReestablishLockForTests.unlock();
  }

  @VisibleForTesting
  synchronized long getZKSessionIdForTests() {
    if (zkClient != null) {
      return zkClient.getSessionId();
    } else {
      return -1;
    }
  }

  @VisibleForTesting
  synchronized State getStateForTests() {
    return state;
  }

  @VisibleForTesting
  synchronized boolean isMonitorLockNodePending() {
    return monitorLockNodePending;
  }

  private boolean reEstablishSession() {
    int connectionRetryCount = 0;
    boolean success = false;
    while(!success && connectionRetryCount < maxRetryNum) {
      LOG.debug("Establishing zookeeper connection for " + this);
      try {
        createConnection();
        success = true;
      } catch(IOException e) {
        LOG.warn(e);
        sleepFor(5000);
      } catch(KeeperException e) {
        LOG.warn(e);
        sleepFor(5000);
      }
      ++connectionRetryCount;
    }
    return success;
  }

  private void createConnection() throws IOException, KeeperException {
    if (zkClient != null) {
      try {
        zkClient.close();
      } catch (InterruptedException e) {
        throw new IOException("Interrupted while closing ZK",
            e);
      }
      zkClient = null;
      watcher = null;
    }
    zkClient = connectToZooKeeper();
    LOG.debug("Created new connection for " + this);
  }

  @InterfaceAudience.Private
  public synchronized void terminateConnection() {
    if (zkClient == null) {
      return;
    }
    LOG.debug("Terminating ZK connection for " + this);
    ZooKeeper tempZk = zkClient;
    zkClient = null;
    watcher = null;
    try {
      tempZk.close();
    } catch(InterruptedException e) {
      LOG.warn(e);
    }
    zkConnectionState = ConnectionState.TERMINATED;
    wantToBeInElection = false;
  }

  private void reset() {
    state = State.INIT;
    terminateConnection();
  }

  private boolean becomeActive() {
    assert wantToBeInElection;
    if (state == State.ACTIVE) {
      // already active
      return true;
    }
    try {
      Stat oldBreadcrumbStat = fenceOldActive();
      writeBreadCrumbNode(oldBreadcrumbStat);

      LOG.debug("Becoming active for " + this);
      appClient.becomeActive();
      state = State.ACTIVE;
      return true;
    } catch (Exception e) {
      LOG.warn("Exception handling the winning of election", e);
      // Caller will handle quitting and rejoining the election.
      return false;
    }
  }

  /**
   * Write the "ActiveBreadCrumb" node, indicating that this node may need
   * to be fenced on failover.
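   * <p>
   * The breadcrumb is a persistent znode: it is created (or updated) each time
   * this elector becomes active, deleted again on a graceful quitElection when
   * no fencing is needed, and read by the next active in fenceOldActive() to
   * decide whether the previous active still needs to be fenced.
   *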
   * @param oldBreadcrumbStat
   */
  private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
      throws KeeperException, InterruptedException {
    Preconditions.checkState(appData != null, "no appdata");

    LOG.info("Writing znode " + zkBreadCrumbPath +
        " to indicate that the local node is the most recent active...");
    if (oldBreadcrumbStat == null) {
      // No previous active, just create the node
      createWithRetries(zkBreadCrumbPath, appData, zkAcl,
          CreateMode.PERSISTENT);
    } else {
      // There was a previous active, update the node
      setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
    }
  }

  /**
   * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
   * active status.
   * If this fails, it will simply warn, since the graceful release behavior
   * is only an optimization.
   */
  private void tryDeleteOwnBreadCrumbNode() {
    assert state == State.ACTIVE;
    LOG.info("Deleting bread-crumb of active node...");

    // Sanity check the data. This shouldn't be strictly necessary,
    // but better to play it safe.
    Stat stat = new Stat();
    byte[] data = null;
    try {
      data = zkClient.getData(zkBreadCrumbPath, false, stat);

      if (!Arrays.equals(data, appData)) {
        throw new IllegalStateException(
            "We thought we were active, but in fact " +
            "the active znode had the wrong data: " +
            StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
      }

      deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
    } catch (Exception e) {
      LOG.warn("Unable to delete our own bread-crumb of being active at " +
          zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
          "Expecting to be fenced by the next active.");
    }
  }

  /**
   * If there is a breadcrumb node indicating that another node may need
   * fencing, try to fence that node.
   * @return the Stat of the breadcrumb node that was read, or null
   *         if no breadcrumb node existed
   */
  private Stat fenceOldActive() throws InterruptedException, KeeperException {
    final Stat stat = new Stat();
    byte[] data;
    LOG.info("Checking for any old active which needs to be fenced...");
    try {
      data = zkDoWithRetries(new ZKAction<byte[]>() {
        @Override
        public byte[] run() throws KeeperException, InterruptedException {
          return zkClient.getData(zkBreadCrumbPath, false, stat);
        }
      });
    } catch (KeeperException ke) {
      if (isNodeDoesNotExist(ke.code())) {
        LOG.info("No old node to fence");
        return null;
      }

      // If we failed to read for any other reason, then likely we lost
      // our session, or we don't have permissions, etc. In any case,
      // we probably shouldn't become active, and failing the whole
      // thing is the best bet.
      throw ke;
    }

    LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
    if (Arrays.equals(data, appData)) {
      LOG.info("But old node has our own data, so don't need to fence it.");
    } else {
      appClient.fenceOldActive(data);
    }
    return stat;
  }

  private void becomeStandby() {
    if (state != State.STANDBY) {
      LOG.debug("Becoming standby for " + this);
      state = State.STANDBY;
      appClient.becomeStandby();
    }
  }

  private void enterNeutralMode() {
    if (state != State.NEUTRAL) {
      LOG.debug("Entering neutral mode for " + this);
      state = State.NEUTRAL;
      appClient.enterNeutralMode();
    }
  }

  private void createLockNodeAsync() {
    zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
        this, zkClient);
  }

  private void monitorLockNodeAsync() {
    if (monitorLockNodePending && monitorLockNodeClient == zkClient) {
      LOG.info("Ignore duplicate monitor lock-node request.");
      return;
    }
    monitorLockNodePending = true;
    monitorLockNodeClient = zkClient;
    zkClient.exists(zkLockFilePath,
        watcher, this,
        zkClient);
  }

  private String createWithRetries(final String path, final byte[] data,
      final List<ACL> acl, final CreateMode mode)
      throws InterruptedException, KeeperException {
    return zkDoWithRetries(new ZKAction<String>() {
      @Override
      public String run() throws KeeperException, InterruptedException {
        return zkClient.create(path, data, acl, mode);
      }
    });
  }

  private byte[] getDataWithRetries(final String path, final boolean watch,
      final Stat stat) throws InterruptedException, KeeperException {
    return zkDoWithRetries(new ZKAction<byte[]>() {
      @Override
      public byte[] run() throws KeeperException, InterruptedException {
        return zkClient.getData(path, watch, stat);
      }
    });
  }

  private Stat setDataWithRetries(final String path, final byte[] data,
      final int version) throws InterruptedException, KeeperException {
    return zkDoWithRetries(new ZKAction<Stat>() {
      @Override
      public Stat run() throws KeeperException, InterruptedException {
        return zkClient.setData(path, data, version);
      }
    });
  }

  private void deleteWithRetries(final String path, final int version)
      throws KeeperException, InterruptedException {
    zkDoWithRetries(new ZKAction<Void>() {
      @Override
      public Void run() throws KeeperException, InterruptedException {
        zkClient.delete(path, version);
        return null;
      }
    });
  }

  private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException,
      InterruptedException {
    int retry = 0;
    while (true) {
      try {
        return action.run();
      } catch (KeeperException ke) {
        if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
          continue;
        }
        throw ke;
      }
    }
  }

  private interface ZKAction<T> {
    T run() throws KeeperException, InterruptedException;
  }

  /**
   * The callbacks and watchers pass a reference to the ZK client
   * which made the original call. We don't want to take action
   * based on any callbacks from prior clients after we quit
   * the election.
   * @param ctx the ZK client passed into the watcher
   * @return true if it matches the current client
   */
  private synchronized boolean isStaleClient(Object ctx) {
    Preconditions.checkNotNull(ctx);
    if (zkClient != (ZooKeeper)ctx) {
      LOG.warn("Ignoring stale result from old client with sessionId " +
          String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
      return true;
    }
    return false;
  }

  /**
   * Watcher implementation which keeps a reference around to the
   * original ZK connection, and passes it back along with any
   * events.
   */
  private final class WatcherWithClientRef implements Watcher {
    private ZooKeeper zk;

    /**
     * Latch fired whenever any event arrives. This is used in order
     * to wait for the Connected event when the client is first created.
     */
    private CountDownLatch hasReceivedEvent = new CountDownLatch(1);

    /**
     * Latch used to wait until the reference to ZooKeeper is set.
     */
    private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);

    /**
     * Waits for the next event from ZooKeeper to arrive.
     *
     * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
     * @throws KeeperException if the connection attempt times out. This will
     *         be a ZooKeeper ConnectionLoss exception code.
     * @throws IOException if interrupted while connecting to ZooKeeper
     */
    private void waitForZKConnectionEvent(int connectionTimeoutMs)
        throws KeeperException, IOException {
      try {
        if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
          LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
              + connectionTimeoutMs + " milliseconds");
          zk.close();
          throw KeeperException.create(Code.CONNECTIONLOSS);
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IOException(
            "Interrupted when connecting to zookeeper server", e);
      }
    }

    private void setZooKeeperRef(ZooKeeper zk) {
      Preconditions.checkState(this.zk == null,
          "zk already set -- must be set exactly once");
      this.zk = zk;
      hasSetZooKeeper.countDown();
    }

    @Override
    public void process(WatchedEvent event) {
      hasReceivedEvent.countDown();
      try {
        if (!hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS)) {
          LOG.debug("Event received with stale zk");
        }
        ActiveStandbyElector.this.processWatchEvent(
            zk, event);
      } catch (Throwable t) {
        fatalError(
            "Failed to process watcher event " + event + ": " +
            StringUtils.stringifyException(t));
      }
    }
  }

  private static boolean isSuccess(Code code) {
    return (code == Code.OK);
  }

  private static boolean isNodeExists(Code code) {
    return (code == Code.NODEEXISTS);
  }

  private static boolean isNodeDoesNotExist(Code code) {
    return (code == Code.NONODE);
  }

  private static boolean isSessionExpired(Code code) {
    return (code == Code.SESSIONEXPIRED);
  }

  private static boolean shouldRetry(Code code) {
    return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT;
  }

  @Override
  public String toString() {
    return "elector id=" + System.identityHashCode(this) +
        " appData=" +
        ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
        " cb=" + appClient;
  }

  public String getHAZookeeperConnectionState() {
    return this.zkConnectionState.name();
  }
}