001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.ha; 020 021import java.io.IOException; 022import java.util.Arrays; 023import java.util.List; 024import java.util.concurrent.CountDownLatch; 025import java.util.concurrent.TimeUnit; 026import java.util.concurrent.locks.Lock; 027import java.util.concurrent.locks.ReentrantLock; 028 029import org.apache.commons.logging.Log; 030import org.apache.commons.logging.LogFactory; 031import org.apache.hadoop.HadoopIllegalArgumentException; 032import org.apache.hadoop.classification.InterfaceAudience; 033import org.apache.hadoop.classification.InterfaceStability; 034import org.apache.hadoop.util.ZKUtil.ZKAuthInfo; 035import org.apache.hadoop.util.StringUtils; 036import org.apache.zookeeper.data.ACL; 037import org.apache.zookeeper.KeeperException; 038import org.apache.zookeeper.Watcher; 039import org.apache.zookeeper.WatchedEvent; 040import org.apache.zookeeper.Watcher.Event; 041import org.apache.zookeeper.ZKUtil; 042import org.apache.zookeeper.ZooKeeper; 043import org.apache.zookeeper.CreateMode; 044import org.apache.zookeeper.AsyncCallback.*; 045import org.apache.zookeeper.data.Stat; 046import org.apache.zookeeper.KeeperException.Code; 047 048import com.google.common.annotations.VisibleForTesting; 049import com.google.common.base.Preconditions; 050 051/** 052 * 053 * This class implements a simple library to perform leader election on top of 054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election 055 * can be performed by atomically creating an ephemeral lock file (znode) on 056 * Zookeeper. The service instance that successfully creates the znode becomes 057 * active and the rest become standbys. <br/> 058 * This election mechanism is only efficient for small number of election 059 * candidates (order of 10's) because contention on single znode by a large 060 * number of candidates can result in Zookeeper overload. <br/> 061 * The elector does not guarantee fencing (protection of shared resources) among 062 * service instances. After it has notified an instance about becoming a leader, 063 * then that instance must ensure that it meets the service consistency 064 * requirements. If it cannot do so, then it is recommended to quit the 065 * election. The application implements the {@link ActiveStandbyElectorCallback} 066 * to interact with the elector 067 */ 068@InterfaceAudience.Private 069@InterfaceStability.Evolving 070public class ActiveStandbyElector implements StatCallback, StringCallback { 071 072 /** 073 * Callback interface to interact with the ActiveStandbyElector object. <br/> 074 * The application will be notified with a callback only on state changes 075 * (i.e. there will never be successive calls to becomeActive without an 076 * intermediate call to enterNeutralMode). <br/> 077 * The callbacks will be running on Zookeeper client library threads. The 078 * application should return from these callbacks quickly so as not to impede 079 * Zookeeper client library performance and notifications. The app will 080 * typically remember the state change and return from the callback. It will 081 * then proceed with implementing actions around that state change. It is 082 * possible to be called back again while these actions are in flight and the 083 * app should handle this scenario. 084 */ 085 public interface ActiveStandbyElectorCallback { 086 /** 087 * This method is called when the app becomes the active leader. 088 * If the service fails to become active, it should throw 089 * ServiceFailedException. This will cause the elector to 090 * sleep for a short period, then re-join the election. 091 * 092 * Callback implementations are expected to manage their own 093 * timeouts (e.g. when making an RPC to a remote node). 094 */ 095 void becomeActive() throws ServiceFailedException; 096 097 /** 098 * This method is called when the app becomes a standby 099 */ 100 void becomeStandby(); 101 102 /** 103 * If the elector gets disconnected from Zookeeper and does not know about 104 * the lock state, then it will notify the service via the enterNeutralMode 105 * interface. The service may choose to ignore this or stop doing state 106 * changing operations. Upon reconnection, the elector verifies the leader 107 * status and calls back on the becomeActive and becomeStandby app 108 * interfaces. <br/> 109 * Zookeeper disconnects can happen due to network issues or loss of 110 * Zookeeper quorum. Thus enterNeutralMode can be used to guard against 111 * split-brain issues. In such situations it might be prudent to call 112 * becomeStandby too. However, such state change operations might be 113 * expensive and enterNeutralMode can help guard against doing that for 114 * transient issues. 115 */ 116 void enterNeutralMode(); 117 118 /** 119 * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper 120 * errors or Zookeeper persistent unavailability) then notifyFatalError is 121 * called to notify the app about it. 122 */ 123 void notifyFatalError(String errorMessage); 124 125 /** 126 * If an old active has failed, rather than exited gracefully, then 127 * the new active may need to take some fencing actions against it 128 * before proceeding with failover. 129 * 130 * @param oldActiveData the application data provided by the prior active 131 */ 132 void fenceOldActive(byte[] oldActiveData); 133 } 134 135 /** 136 * Name of the lock znode used by the library. Protected for access in test 137 * classes 138 */ 139 @VisibleForTesting 140 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock"; 141 @VisibleForTesting 142 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb"; 143 144 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class); 145 146 private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000; 147 148 private static enum ConnectionState { 149 DISCONNECTED, CONNECTED, TERMINATED 150 }; 151 152 static enum State { 153 INIT, ACTIVE, STANDBY, NEUTRAL 154 }; 155 156 private State state = State.INIT; 157 private int createRetryCount = 0; 158 private int statRetryCount = 0; 159 private ZooKeeper zkClient; 160 private WatcherWithClientRef watcher; 161 private ConnectionState zkConnectionState = ConnectionState.TERMINATED; 162 163 private final ActiveStandbyElectorCallback appClient; 164 private final String zkHostPort; 165 private final int zkSessionTimeout; 166 private final List<ACL> zkAcl; 167 private final List<ZKAuthInfo> zkAuthInfo; 168 private byte[] appData; 169 private final String zkLockFilePath; 170 private final String zkBreadCrumbPath; 171 private final String znodeWorkingDir; 172 private final int maxRetryNum; 173 174 private Lock sessionReestablishLockForTests = new ReentrantLock(); 175 private boolean wantToBeInElection; 176 private boolean monitorLockNodePending = false; 177 private ZooKeeper monitorLockNodeClient; 178 179 /** 180 * Create a new ActiveStandbyElector object <br/> 181 * The elector is created by providing to it the Zookeeper configuration, the 182 * parent znode under which to create the znode and a reference to the 183 * callback interface. <br/> 184 * The parent znode name must be the same for all service instances and 185 * different across services. <br/> 186 * After the leader has been lost, a new leader will be elected after the 187 * session timeout expires. Hence, the app must set this parameter based on 188 * its needs for failure response time. The session timeout must be greater 189 * than the Zookeeper disconnect timeout and is recommended to be 3X that 190 * value to enable Zookeeper to retry transient disconnections. Setting a very 191 * short session timeout may result in frequent transitions between active and 192 * standby states during issues like network outages/GS pauses. 193 * 194 * @param zookeeperHostPorts 195 * ZooKeeper hostPort for all ZooKeeper servers 196 * @param zookeeperSessionTimeout 197 * ZooKeeper session timeout 198 * @param parentZnodeName 199 * znode under which to create the lock 200 * @param acl 201 * ZooKeeper ACL's 202 * @param authInfo a list of authentication credentials to add to the 203 * ZK connection 204 * @param app 205 * reference to callback interface object 206 * @throws IOException 207 * @throws HadoopIllegalArgumentException 208 */ 209 public ActiveStandbyElector(String zookeeperHostPorts, 210 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl, 211 List<ZKAuthInfo> authInfo, 212 ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException, 213 HadoopIllegalArgumentException, KeeperException { 214 if (app == null || acl == null || parentZnodeName == null 215 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { 216 throw new HadoopIllegalArgumentException("Invalid argument"); 217 } 218 zkHostPort = zookeeperHostPorts; 219 zkSessionTimeout = zookeeperSessionTimeout; 220 zkAcl = acl; 221 zkAuthInfo = authInfo; 222 appClient = app; 223 znodeWorkingDir = parentZnodeName; 224 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME; 225 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; 226 this.maxRetryNum = maxRetryNum; 227 228 // createConnection for future API calls 229 createConnection(); 230 } 231 232 /** 233 * To participate in election, the app will call joinElection. The result will 234 * be notified by a callback on either the becomeActive or becomeStandby app 235 * interfaces. <br/> 236 * After this the elector will automatically monitor the leader status and 237 * perform re-election if necessary<br/> 238 * The app could potentially start off in standby mode and ignore the 239 * becomeStandby call. 240 * 241 * @param data 242 * to be set by the app. non-null data must be set. 243 * @throws HadoopIllegalArgumentException 244 * if valid data is not supplied 245 */ 246 public synchronized void joinElection(byte[] data) 247 throws HadoopIllegalArgumentException { 248 249 if (data == null) { 250 throw new HadoopIllegalArgumentException("data cannot be null"); 251 } 252 253 if (wantToBeInElection) { 254 LOG.info("Already in election. Not re-connecting."); 255 return; 256 } 257 258 appData = new byte[data.length]; 259 System.arraycopy(data, 0, appData, 0, data.length); 260 261 LOG.debug("Attempting active election for " + this); 262 joinElectionInternal(); 263 } 264 265 /** 266 * @return true if the configured parent znode exists 267 */ 268 public synchronized boolean parentZNodeExists() 269 throws IOException, InterruptedException { 270 Preconditions.checkState(zkClient != null); 271 try { 272 return zkClient.exists(znodeWorkingDir, false) != null; 273 } catch (KeeperException e) { 274 throw new IOException("Couldn't determine existence of znode '" + 275 znodeWorkingDir + "'", e); 276 } 277 } 278 279 /** 280 * Utility function to ensure that the configured base znode exists. 281 * This recursively creates the znode as well as all of its parents. 282 */ 283 public synchronized void ensureParentZNode() 284 throws IOException, InterruptedException { 285 Preconditions.checkState(!wantToBeInElection, 286 "ensureParentZNode() may not be called while in the election"); 287 288 String pathParts[] = znodeWorkingDir.split("/"); 289 Preconditions.checkArgument(pathParts.length >= 1 && 290 pathParts[0].isEmpty(), 291 "Invalid path: %s", znodeWorkingDir); 292 293 StringBuilder sb = new StringBuilder(); 294 for (int i = 1; i < pathParts.length; i++) { 295 sb.append("/").append(pathParts[i]); 296 String prefixPath = sb.toString(); 297 LOG.debug("Ensuring existence of " + prefixPath); 298 try { 299 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT); 300 } catch (KeeperException e) { 301 if (isNodeExists(e.code())) { 302 // This is OK - just ensuring existence. 303 continue; 304 } else { 305 throw new IOException("Couldn't create " + prefixPath, e); 306 } 307 } 308 } 309 310 LOG.info("Successfully created " + znodeWorkingDir + " in ZK."); 311 } 312 313 /** 314 * Clear all of the state held within the parent ZNode. 315 * This recursively deletes everything within the znode as well as the 316 * parent znode itself. It should only be used when it's certain that 317 * no electors are currently participating in the election. 318 */ 319 public synchronized void clearParentZNode() 320 throws IOException, InterruptedException { 321 Preconditions.checkState(!wantToBeInElection, 322 "clearParentZNode() may not be called while in the election"); 323 324 try { 325 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK..."); 326 327 zkDoWithRetries(new ZKAction<Void>() { 328 @Override 329 public Void run() throws KeeperException, InterruptedException { 330 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir); 331 return null; 332 } 333 }); 334 } catch (KeeperException e) { 335 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir, 336 e); 337 } 338 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK."); 339 } 340 341 342 /** 343 * Any service instance can drop out of the election by calling quitElection. 344 * <br/> 345 * This will lose any leader status, if held, and stop monitoring of the lock 346 * node. <br/> 347 * If the instance wants to participate in election again, then it needs to 348 * call joinElection(). <br/> 349 * This allows service instances to take themselves out of rotation for known 350 * impending unavailable states (e.g. long GC pause or software upgrade). 351 * 352 * @param needFence true if the underlying daemon may need to be fenced 353 * if a failover occurs due to dropping out of the election. 354 */ 355 public synchronized void quitElection(boolean needFence) { 356 LOG.info("Yielding from election"); 357 if (!needFence && state == State.ACTIVE) { 358 // If active is gracefully going back to standby mode, remove 359 // our permanent znode so no one fences us. 360 tryDeleteOwnBreadCrumbNode(); 361 } 362 reset(); 363 wantToBeInElection = false; 364 } 365 366 /** 367 * Exception thrown when there is no active leader 368 */ 369 public static class ActiveNotFoundException extends Exception { 370 private static final long serialVersionUID = 3505396722342846462L; 371 } 372 373 /** 374 * get data set by the active leader 375 * 376 * @return data set by the active instance 377 * @throws ActiveNotFoundException 378 * when there is no active leader 379 * @throws KeeperException 380 * other zookeeper operation errors 381 * @throws InterruptedException 382 * @throws IOException 383 * when ZooKeeper connection could not be established 384 */ 385 public synchronized byte[] getActiveData() throws ActiveNotFoundException, 386 KeeperException, InterruptedException, IOException { 387 try { 388 if (zkClient == null) { 389 createConnection(); 390 } 391 Stat stat = new Stat(); 392 return getDataWithRetries(zkLockFilePath, false, stat); 393 } catch(KeeperException e) { 394 Code code = e.code(); 395 if (isNodeDoesNotExist(code)) { 396 // handle the commonly expected cases that make sense for us 397 throw new ActiveNotFoundException(); 398 } else { 399 throw e; 400 } 401 } 402 } 403 404 /** 405 * interface implementation of Zookeeper callback for create 406 */ 407 @Override 408 public synchronized void processResult(int rc, String path, Object ctx, 409 String name) { 410 if (isStaleClient(ctx)) return; 411 LOG.debug("CreateNode result: " + rc + " for path: " + path 412 + " connectionState: " + zkConnectionState + 413 " for " + this); 414 415 Code code = Code.get(rc); 416 if (isSuccess(code)) { 417 // we successfully created the znode. we are the leader. start monitoring 418 if (becomeActive()) { 419 monitorActiveStatus(); 420 } else { 421 reJoinElectionAfterFailureToBecomeActive(); 422 } 423 return; 424 } 425 426 if (isNodeExists(code)) { 427 if (createRetryCount == 0) { 428 // znode exists and we did not retry the operation. so a different 429 // instance has created it. become standby and monitor lock. 430 becomeStandby(); 431 } 432 // if we had retried then the znode could have been created by our first 433 // attempt to the server (that we lost) and this node exists response is 434 // for the second attempt. verify this case via ephemeral node owner. this 435 // will happen on the callback for monitoring the lock. 436 monitorActiveStatus(); 437 return; 438 } 439 440 String errorMessage = "Received create error from Zookeeper. code:" 441 + code.toString() + " for path " + path; 442 LOG.debug(errorMessage); 443 444 if (shouldRetry(code)) { 445 if (createRetryCount < maxRetryNum) { 446 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); 447 ++createRetryCount; 448 createLockNodeAsync(); 449 return; 450 } 451 errorMessage = errorMessage 452 + ". Not retrying further znode create connection errors."; 453 } else if (isSessionExpired(code)) { 454 // This isn't fatal - the client Watcher will re-join the election 455 LOG.warn("Lock acquisition failed because session was lost"); 456 return; 457 } 458 459 fatalError(errorMessage); 460 } 461 462 /** 463 * interface implementation of Zookeeper callback for monitor (exists) 464 */ 465 @Override 466 public synchronized void processResult(int rc, String path, Object ctx, 467 Stat stat) { 468 if (isStaleClient(ctx)) return; 469 monitorLockNodePending = false; 470 471 assert wantToBeInElection : 472 "Got a StatNode result after quitting election"; 473 474 LOG.debug("StatNode result: " + rc + " for path: " + path 475 + " connectionState: " + zkConnectionState + " for " + this); 476 477 478 Code code = Code.get(rc); 479 if (isSuccess(code)) { 480 // the following owner check completes verification in case the lock znode 481 // creation was retried 482 if (stat.getEphemeralOwner() == zkClient.getSessionId()) { 483 // we own the lock znode. so we are the leader 484 if (!becomeActive()) { 485 reJoinElectionAfterFailureToBecomeActive(); 486 } 487 } else { 488 // we dont own the lock znode. so we are a standby. 489 becomeStandby(); 490 } 491 // the watch set by us will notify about changes 492 return; 493 } 494 495 if (isNodeDoesNotExist(code)) { 496 // the lock znode disappeared before we started monitoring it 497 enterNeutralMode(); 498 joinElectionInternal(); 499 return; 500 } 501 502 String errorMessage = "Received stat error from Zookeeper. code:" 503 + code.toString(); 504 LOG.debug(errorMessage); 505 506 if (shouldRetry(code)) { 507 if (statRetryCount < maxRetryNum) { 508 ++statRetryCount; 509 monitorLockNodeAsync(); 510 return; 511 } 512 errorMessage = errorMessage 513 + ". Not retrying further znode monitoring connection errors."; 514 } else if (isSessionExpired(code)) { 515 // This isn't fatal - the client Watcher will re-join the election 516 LOG.warn("Lock monitoring failed because session was lost"); 517 return; 518 } 519 520 fatalError(errorMessage); 521 } 522 523 /** 524 * We failed to become active. Re-join the election, but 525 * sleep for a few seconds after terminating our existing 526 * session, so that other nodes have a chance to become active. 527 * The failure to become active is already logged inside 528 * becomeActive(). 529 */ 530 private void reJoinElectionAfterFailureToBecomeActive() { 531 reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE); 532 } 533 534 /** 535 * interface implementation of Zookeeper watch events (connection and node), 536 * proxied by {@link WatcherWithClientRef}. 537 */ 538 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) { 539 Event.EventType eventType = event.getType(); 540 if (isStaleClient(zk)) return; 541 LOG.debug("Watcher event type: " + eventType + " with state:" 542 + event.getState() + " for path:" + event.getPath() 543 + " connectionState: " + zkConnectionState 544 + " for " + this); 545 546 if (eventType == Event.EventType.None) { 547 // the connection state has changed 548 switch (event.getState()) { 549 case SyncConnected: 550 LOG.info("Session connected."); 551 // if the listener was asked to move to safe state then it needs to 552 // be undone 553 ConnectionState prevConnectionState = zkConnectionState; 554 zkConnectionState = ConnectionState.CONNECTED; 555 if (prevConnectionState == ConnectionState.DISCONNECTED && 556 wantToBeInElection) { 557 monitorActiveStatus(); 558 } 559 break; 560 case Disconnected: 561 LOG.info("Session disconnected. Entering neutral mode..."); 562 563 // ask the app to move to safe state because zookeeper connection 564 // is not active and we dont know our state 565 zkConnectionState = ConnectionState.DISCONNECTED; 566 enterNeutralMode(); 567 break; 568 case Expired: 569 // the connection got terminated because of session timeout 570 // call listener to reconnect 571 LOG.info("Session expired. Entering neutral mode and rejoining..."); 572 enterNeutralMode(); 573 reJoinElection(0); 574 break; 575 case SaslAuthenticated: 576 LOG.info("Successfully authenticated to ZooKeeper using SASL."); 577 break; 578 default: 579 fatalError("Unexpected Zookeeper watch event state: " 580 + event.getState()); 581 break; 582 } 583 584 return; 585 } 586 587 // a watch on lock path in zookeeper has fired. so something has changed on 588 // the lock. ideally we should check that the path is the same as the lock 589 // path but trusting zookeeper for now 590 String path = event.getPath(); 591 if (path != null) { 592 switch (eventType) { 593 case NodeDeleted: 594 if (state == State.ACTIVE) { 595 enterNeutralMode(); 596 } 597 joinElectionInternal(); 598 break; 599 case NodeDataChanged: 600 monitorActiveStatus(); 601 break; 602 default: 603 LOG.debug("Unexpected node event: " + eventType + " for path: " + path); 604 monitorActiveStatus(); 605 } 606 607 return; 608 } 609 610 // some unexpected error has occurred 611 fatalError("Unexpected watch error from Zookeeper"); 612 } 613 614 /** 615 * Get a new zookeeper client instance. protected so that test class can 616 * inherit and pass in a mock object for zookeeper 617 * 618 * @return new zookeeper client instance 619 * @throws IOException 620 * @throws KeeperException zookeeper connectionloss exception 621 */ 622 protected synchronized ZooKeeper getNewZooKeeper() throws IOException, 623 KeeperException { 624 625 // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and 626 // may trigger the Connected event immediately. So, if we register the 627 // watcher after constructing ZooKeeper, we may miss that event. Instead, 628 // we construct the watcher first, and have it block any events it receives 629 // before we can set its ZooKeeper reference. 630 watcher = new WatcherWithClientRef(); 631 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); 632 watcher.setZooKeeperRef(zk); 633 634 // Wait for the asynchronous success/failure. This may throw an exception 635 // if we don't connect within the session timeout. 636 watcher.waitForZKConnectionEvent(zkSessionTimeout); 637 638 for (ZKAuthInfo auth : zkAuthInfo) { 639 zk.addAuthInfo(auth.getScheme(), auth.getAuth()); 640 } 641 return zk; 642 } 643 644 private void fatalError(String errorMessage) { 645 LOG.fatal(errorMessage); 646 reset(); 647 appClient.notifyFatalError(errorMessage); 648 } 649 650 private void monitorActiveStatus() { 651 assert wantToBeInElection; 652 LOG.debug("Monitoring active leader for " + this); 653 statRetryCount = 0; 654 monitorLockNodeAsync(); 655 } 656 657 private void joinElectionInternal() { 658 Preconditions.checkState(appData != null, 659 "trying to join election without any app data"); 660 if (zkClient == null) { 661 if (!reEstablishSession()) { 662 fatalError("Failed to reEstablish connection with ZooKeeper"); 663 return; 664 } 665 } 666 667 createRetryCount = 0; 668 wantToBeInElection = true; 669 createLockNodeAsync(); 670 } 671 672 private void reJoinElection(int sleepTime) { 673 LOG.info("Trying to re-establish ZK session"); 674 675 // Some of the test cases rely on expiring the ZK sessions and 676 // ensuring that the other node takes over. But, there's a race 677 // where the original lease holder could reconnect faster than the other 678 // thread manages to take the lock itself. This lock allows the 679 // tests to block the reconnection. It's a shame that this leaked 680 // into non-test code, but the lock is only acquired here so will never 681 // be contended. 682 sessionReestablishLockForTests.lock(); 683 try { 684 terminateConnection(); 685 sleepFor(sleepTime); 686 // Should not join election even before the SERVICE is reported 687 // as HEALTHY from ZKFC monitoring. 688 if (appData != null) { 689 joinElectionInternal(); 690 } else { 691 LOG.info("Not joining election since service has not yet been " + 692 "reported as healthy."); 693 } 694 } finally { 695 sessionReestablishLockForTests.unlock(); 696 } 697 } 698 699 /** 700 * Sleep for the given number of milliseconds. 701 * This is non-static, and separated out, so that unit tests 702 * can override the behavior not to sleep. 703 */ 704 @VisibleForTesting 705 protected void sleepFor(int sleepMs) { 706 if (sleepMs > 0) { 707 try { 708 Thread.sleep(sleepMs); 709 } catch (InterruptedException e) { 710 Thread.currentThread().interrupt(); 711 } 712 } 713 } 714 715 @VisibleForTesting 716 void preventSessionReestablishmentForTests() { 717 sessionReestablishLockForTests.lock(); 718 } 719 720 @VisibleForTesting 721 void allowSessionReestablishmentForTests() { 722 sessionReestablishLockForTests.unlock(); 723 } 724 725 @VisibleForTesting 726 synchronized long getZKSessionIdForTests() { 727 if (zkClient != null) { 728 return zkClient.getSessionId(); 729 } else { 730 return -1; 731 } 732 } 733 734 @VisibleForTesting 735 synchronized State getStateForTests() { 736 return state; 737 } 738 739 @VisibleForTesting 740 synchronized boolean isMonitorLockNodePending() { 741 return monitorLockNodePending; 742 } 743 744 private boolean reEstablishSession() { 745 int connectionRetryCount = 0; 746 boolean success = false; 747 while(!success && connectionRetryCount < maxRetryNum) { 748 LOG.debug("Establishing zookeeper connection for " + this); 749 try { 750 createConnection(); 751 success = true; 752 } catch(IOException e) { 753 LOG.warn(e); 754 sleepFor(5000); 755 } catch(KeeperException e) { 756 LOG.warn(e); 757 sleepFor(5000); 758 } 759 ++connectionRetryCount; 760 } 761 return success; 762 } 763 764 private void createConnection() throws IOException, KeeperException { 765 if (zkClient != null) { 766 try { 767 zkClient.close(); 768 } catch (InterruptedException e) { 769 throw new IOException("Interrupted while closing ZK", 770 e); 771 } 772 zkClient = null; 773 watcher = null; 774 } 775 zkClient = getNewZooKeeper(); 776 LOG.debug("Created new connection for " + this); 777 } 778 779 @InterfaceAudience.Private 780 public synchronized void terminateConnection() { 781 if (zkClient == null) { 782 return; 783 } 784 LOG.debug("Terminating ZK connection for " + this); 785 ZooKeeper tempZk = zkClient; 786 zkClient = null; 787 watcher = null; 788 try { 789 tempZk.close(); 790 } catch(InterruptedException e) { 791 LOG.warn(e); 792 } 793 zkConnectionState = ConnectionState.TERMINATED; 794 wantToBeInElection = false; 795 } 796 797 private void reset() { 798 state = State.INIT; 799 terminateConnection(); 800 } 801 802 private boolean becomeActive() { 803 assert wantToBeInElection; 804 if (state == State.ACTIVE) { 805 // already active 806 return true; 807 } 808 try { 809 Stat oldBreadcrumbStat = fenceOldActive(); 810 writeBreadCrumbNode(oldBreadcrumbStat); 811 812 LOG.debug("Becoming active for " + this); 813 appClient.becomeActive(); 814 state = State.ACTIVE; 815 return true; 816 } catch (Exception e) { 817 LOG.warn("Exception handling the winning of election", e); 818 // Caller will handle quitting and rejoining the election. 819 return false; 820 } 821 } 822 823 /** 824 * Write the "ActiveBreadCrumb" node, indicating that this node may need 825 * to be fenced on failover. 826 * @param oldBreadcrumbStat 827 */ 828 private void writeBreadCrumbNode(Stat oldBreadcrumbStat) 829 throws KeeperException, InterruptedException { 830 Preconditions.checkState(appData != null, "no appdata"); 831 832 LOG.info("Writing znode " + zkBreadCrumbPath + 833 " to indicate that the local node is the most recent active..."); 834 if (oldBreadcrumbStat == null) { 835 // No previous active, just create the node 836 createWithRetries(zkBreadCrumbPath, appData, zkAcl, 837 CreateMode.PERSISTENT); 838 } else { 839 // There was a previous active, update the node 840 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion()); 841 } 842 } 843 844 /** 845 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up 846 * active status. 847 * If this fails, it will simply warn, since the graceful release behavior 848 * is only an optimization. 849 */ 850 private void tryDeleteOwnBreadCrumbNode() { 851 assert state == State.ACTIVE; 852 LOG.info("Deleting bread-crumb of active node..."); 853 854 // Sanity check the data. This shouldn't be strictly necessary, 855 // but better to play it safe. 856 Stat stat = new Stat(); 857 byte[] data = null; 858 try { 859 data = zkClient.getData(zkBreadCrumbPath, false, stat); 860 861 if (!Arrays.equals(data, appData)) { 862 throw new IllegalStateException( 863 "We thought we were active, but in fact " + 864 "the active znode had the wrong data: " + 865 StringUtils.byteToHexString(data) + " (stat=" + stat + ")"); 866 } 867 868 deleteWithRetries(zkBreadCrumbPath, stat.getVersion()); 869 } catch (Exception e) { 870 LOG.warn("Unable to delete our own bread-crumb of being active at " + 871 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " + 872 "Expecting to be fenced by the next active."); 873 } 874 } 875 876 /** 877 * If there is a breadcrumb node indicating that another node may need 878 * fencing, try to fence that node. 879 * @return the Stat of the breadcrumb node that was read, or null 880 * if no breadcrumb node existed 881 */ 882 private Stat fenceOldActive() throws InterruptedException, KeeperException { 883 final Stat stat = new Stat(); 884 byte[] data; 885 LOG.info("Checking for any old active which needs to be fenced..."); 886 try { 887 data = zkDoWithRetries(new ZKAction<byte[]>() { 888 @Override 889 public byte[] run() throws KeeperException, InterruptedException { 890 return zkClient.getData(zkBreadCrumbPath, false, stat); 891 } 892 }); 893 } catch (KeeperException ke) { 894 if (isNodeDoesNotExist(ke.code())) { 895 LOG.info("No old node to fence"); 896 return null; 897 } 898 899 // If we failed to read for any other reason, then likely we lost 900 // our session, or we don't have permissions, etc. In any case, 901 // we probably shouldn't become active, and failing the whole 902 // thing is the best bet. 903 throw ke; 904 } 905 906 LOG.info("Old node exists: " + StringUtils.byteToHexString(data)); 907 if (Arrays.equals(data, appData)) { 908 LOG.info("But old node has our own data, so don't need to fence it."); 909 } else { 910 appClient.fenceOldActive(data); 911 } 912 return stat; 913 } 914 915 private void becomeStandby() { 916 if (state != State.STANDBY) { 917 LOG.debug("Becoming standby for " + this); 918 state = State.STANDBY; 919 appClient.becomeStandby(); 920 } 921 } 922 923 private void enterNeutralMode() { 924 if (state != State.NEUTRAL) { 925 LOG.debug("Entering neutral mode for " + this); 926 state = State.NEUTRAL; 927 appClient.enterNeutralMode(); 928 } 929 } 930 931 private void createLockNodeAsync() { 932 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, 933 this, zkClient); 934 } 935 936 private void monitorLockNodeAsync() { 937 if (monitorLockNodePending && monitorLockNodeClient == zkClient) { 938 LOG.info("Ignore duplicate monitor lock-node request."); 939 return; 940 } 941 monitorLockNodePending = true; 942 monitorLockNodeClient = zkClient; 943 zkClient.exists(zkLockFilePath, 944 watcher, this, 945 zkClient); 946 } 947 948 private String createWithRetries(final String path, final byte[] data, 949 final List<ACL> acl, final CreateMode mode) 950 throws InterruptedException, KeeperException { 951 return zkDoWithRetries(new ZKAction<String>() { 952 @Override 953 public String run() throws KeeperException, InterruptedException { 954 return zkClient.create(path, data, acl, mode); 955 } 956 }); 957 } 958 959 private byte[] getDataWithRetries(final String path, final boolean watch, 960 final Stat stat) throws InterruptedException, KeeperException { 961 return zkDoWithRetries(new ZKAction<byte[]>() { 962 @Override 963 public byte[] run() throws KeeperException, InterruptedException { 964 return zkClient.getData(path, watch, stat); 965 } 966 }); 967 } 968 969 private Stat setDataWithRetries(final String path, final byte[] data, 970 final int version) throws InterruptedException, KeeperException { 971 return zkDoWithRetries(new ZKAction<Stat>() { 972 @Override 973 public Stat run() throws KeeperException, InterruptedException { 974 return zkClient.setData(path, data, version); 975 } 976 }); 977 } 978 979 private void deleteWithRetries(final String path, final int version) 980 throws KeeperException, InterruptedException { 981 zkDoWithRetries(new ZKAction<Void>() { 982 @Override 983 public Void run() throws KeeperException, InterruptedException { 984 zkClient.delete(path, version); 985 return null; 986 } 987 }); 988 } 989 990 private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException, 991 InterruptedException { 992 int retry = 0; 993 while (true) { 994 try { 995 return action.run(); 996 } catch (KeeperException ke) { 997 if (shouldRetry(ke.code()) && ++retry < maxRetryNum) { 998 continue; 999 } 1000 throw ke; 1001 } 1002 } 1003 } 1004 1005 private interface ZKAction<T> { 1006 T run() throws KeeperException, InterruptedException; 1007 } 1008 1009 /** 1010 * The callbacks and watchers pass a reference to the ZK client 1011 * which made the original call. We don't want to take action 1012 * based on any callbacks from prior clients after we quit 1013 * the election. 1014 * @param ctx the ZK client passed into the watcher 1015 * @return true if it matches the current client 1016 */ 1017 private synchronized boolean isStaleClient(Object ctx) { 1018 Preconditions.checkNotNull(ctx); 1019 if (zkClient != (ZooKeeper)ctx) { 1020 LOG.warn("Ignoring stale result from old client with sessionId " + 1021 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId())); 1022 return true; 1023 } 1024 return false; 1025 } 1026 1027 /** 1028 * Watcher implementation which keeps a reference around to the 1029 * original ZK connection, and passes it back along with any 1030 * events. 1031 */ 1032 private final class WatcherWithClientRef implements Watcher { 1033 private ZooKeeper zk; 1034 1035 /** 1036 * Latch fired whenever any event arrives. This is used in order 1037 * to wait for the Connected event when the client is first created. 1038 */ 1039 private CountDownLatch hasReceivedEvent = new CountDownLatch(1); 1040 1041 /** 1042 * Latch used to wait until the reference to ZooKeeper is set. 1043 */ 1044 private CountDownLatch hasSetZooKeeper = new CountDownLatch(1); 1045 1046 /** 1047 * Waits for the next event from ZooKeeper to arrive. 1048 * 1049 * @param connectionTimeoutMs zookeeper connection timeout in milliseconds 1050 * @throws KeeperException if the connection attempt times out. This will 1051 * be a ZooKeeper ConnectionLoss exception code. 1052 * @throws IOException if interrupted while connecting to ZooKeeper 1053 */ 1054 private void waitForZKConnectionEvent(int connectionTimeoutMs) 1055 throws KeeperException, IOException { 1056 try { 1057 if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) { 1058 LOG.error("Connection timed out: couldn't connect to ZooKeeper in " 1059 + connectionTimeoutMs + " milliseconds"); 1060 zk.close(); 1061 throw KeeperException.create(Code.CONNECTIONLOSS); 1062 } 1063 } catch (InterruptedException e) { 1064 Thread.currentThread().interrupt(); 1065 throw new IOException( 1066 "Interrupted when connecting to zookeeper server", e); 1067 } 1068 } 1069 1070 private void setZooKeeperRef(ZooKeeper zk) { 1071 Preconditions.checkState(this.zk == null, 1072 "zk already set -- must be set exactly once"); 1073 this.zk = zk; 1074 hasSetZooKeeper.countDown(); 1075 } 1076 1077 @Override 1078 public void process(WatchedEvent event) { 1079 hasReceivedEvent.countDown(); 1080 try { 1081 if (!hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS)) { 1082 LOG.debug("Event received with stale zk"); 1083 } 1084 ActiveStandbyElector.this.processWatchEvent( 1085 zk, event); 1086 } catch (Throwable t) { 1087 fatalError( 1088 "Failed to process watcher event " + event + ": " + 1089 StringUtils.stringifyException(t)); 1090 } 1091 } 1092 } 1093 1094 private static boolean isSuccess(Code code) { 1095 return (code == Code.OK); 1096 } 1097 1098 private static boolean isNodeExists(Code code) { 1099 return (code == Code.NODEEXISTS); 1100 } 1101 1102 private static boolean isNodeDoesNotExist(Code code) { 1103 return (code == Code.NONODE); 1104 } 1105 1106 private static boolean isSessionExpired(Code code) { 1107 return (code == Code.SESSIONEXPIRED); 1108 } 1109 1110 private static boolean shouldRetry(Code code) { 1111 return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT; 1112 } 1113 1114 @Override 1115 public String toString() { 1116 return "elector id=" + System.identityHashCode(this) + 1117 " appData=" + 1118 ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 1119 " cb=" + appClient; 1120 } 1121 1122 public String getHAZookeeperConnectionState() { 1123 return this.zkConnectionState.name(); 1124 } 1125}