001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.ha; 020 021 import java.io.IOException; 022 import java.util.Arrays; 023 import java.util.List; 024 import java.util.concurrent.CountDownLatch; 025 import java.util.concurrent.TimeUnit; 026 import java.util.concurrent.locks.Lock; 027 import java.util.concurrent.locks.ReentrantLock; 028 029 import org.apache.commons.logging.Log; 030 import org.apache.commons.logging.LogFactory; 031 import org.apache.hadoop.HadoopIllegalArgumentException; 032 import org.apache.hadoop.classification.InterfaceAudience; 033 import org.apache.hadoop.classification.InterfaceStability; 034 import org.apache.hadoop.util.ZKUtil.ZKAuthInfo; 035 import org.apache.hadoop.util.StringUtils; 036 import org.apache.zookeeper.data.ACL; 037 import org.apache.zookeeper.KeeperException; 038 import org.apache.zookeeper.Watcher; 039 import org.apache.zookeeper.WatchedEvent; 040 import org.apache.zookeeper.Watcher.Event; 041 import org.apache.zookeeper.ZKUtil; 042 import org.apache.zookeeper.ZooKeeper; 043 import org.apache.zookeeper.CreateMode; 044 import org.apache.zookeeper.AsyncCallback.*; 045 import org.apache.zookeeper.data.Stat; 046 import org.apache.zookeeper.KeeperException.Code; 047 048 import com.google.common.annotations.VisibleForTesting; 049 import com.google.common.base.Preconditions; 050 051 /** 052 * 053 * This class implements a simple library to perform leader election on top of 054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election 055 * can be performed by atomically creating an ephemeral lock file (znode) on 056 * Zookeeper. The service instance that successfully creates the znode becomes 057 * active and the rest become standbys. <br/> 058 * This election mechanism is only efficient for small number of election 059 * candidates (order of 10's) because contention on single znode by a large 060 * number of candidates can result in Zookeeper overload. <br/> 061 * The elector does not guarantee fencing (protection of shared resources) among 062 * service instances. After it has notified an instance about becoming a leader, 063 * then that instance must ensure that it meets the service consistency 064 * requirements. If it cannot do so, then it is recommended to quit the 065 * election. The application implements the {@link ActiveStandbyElectorCallback} 066 * to interact with the elector 067 */ 068 @InterfaceAudience.Private 069 @InterfaceStability.Evolving 070 public class ActiveStandbyElector implements StatCallback, StringCallback { 071 072 /** 073 * Callback interface to interact with the ActiveStandbyElector object. <br/> 074 * The application will be notified with a callback only on state changes 075 * (i.e. there will never be successive calls to becomeActive without an 076 * intermediate call to enterNeutralMode). <br/> 077 * The callbacks will be running on Zookeeper client library threads. The 078 * application should return from these callbacks quickly so as not to impede 079 * Zookeeper client library performance and notifications. The app will 080 * typically remember the state change and return from the callback. It will 081 * then proceed with implementing actions around that state change. It is 082 * possible to be called back again while these actions are in flight and the 083 * app should handle this scenario. 084 */ 085 public interface ActiveStandbyElectorCallback { 086 /** 087 * This method is called when the app becomes the active leader. 088 * If the service fails to become active, it should throw 089 * ServiceFailedException. This will cause the elector to 090 * sleep for a short period, then re-join the election. 091 * 092 * Callback implementations are expected to manage their own 093 * timeouts (e.g. when making an RPC to a remote node). 094 */ 095 void becomeActive() throws ServiceFailedException; 096 097 /** 098 * This method is called when the app becomes a standby 099 */ 100 void becomeStandby(); 101 102 /** 103 * If the elector gets disconnected from Zookeeper and does not know about 104 * the lock state, then it will notify the service via the enterNeutralMode 105 * interface. The service may choose to ignore this or stop doing state 106 * changing operations. Upon reconnection, the elector verifies the leader 107 * status and calls back on the becomeActive and becomeStandby app 108 * interfaces. <br/> 109 * Zookeeper disconnects can happen due to network issues or loss of 110 * Zookeeper quorum. Thus enterNeutralMode can be used to guard against 111 * split-brain issues. In such situations it might be prudent to call 112 * becomeStandby too. However, such state change operations might be 113 * expensive and enterNeutralMode can help guard against doing that for 114 * transient issues. 115 */ 116 void enterNeutralMode(); 117 118 /** 119 * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper 120 * errors or Zookeeper persistent unavailability) then notifyFatalError is 121 * called to notify the app about it. 122 */ 123 void notifyFatalError(String errorMessage); 124 125 /** 126 * If an old active has failed, rather than exited gracefully, then 127 * the new active may need to take some fencing actions against it 128 * before proceeding with failover. 129 * 130 * @param oldActiveData the application data provided by the prior active 131 */ 132 void fenceOldActive(byte[] oldActiveData); 133 } 134 135 /** 136 * Name of the lock znode used by the library. Protected for access in test 137 * classes 138 */ 139 @VisibleForTesting 140 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock"; 141 @VisibleForTesting 142 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb"; 143 144 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class); 145 146 static int NUM_RETRIES = 3; 147 private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000; 148 149 private static enum ConnectionState { 150 DISCONNECTED, CONNECTED, TERMINATED 151 }; 152 153 static enum State { 154 INIT, ACTIVE, STANDBY, NEUTRAL 155 }; 156 157 private State state = State.INIT; 158 private int createRetryCount = 0; 159 private int statRetryCount = 0; 160 private ZooKeeper zkClient; 161 private WatcherWithClientRef watcher; 162 private ConnectionState zkConnectionState = ConnectionState.TERMINATED; 163 164 private final ActiveStandbyElectorCallback appClient; 165 private final String zkHostPort; 166 private final int zkSessionTimeout; 167 private final List<ACL> zkAcl; 168 private final List<ZKAuthInfo> zkAuthInfo; 169 private byte[] appData; 170 private final String zkLockFilePath; 171 private final String zkBreadCrumbPath; 172 private final String znodeWorkingDir; 173 174 private Lock sessionReestablishLockForTests = new ReentrantLock(); 175 private boolean wantToBeInElection; 176 177 /** 178 * Create a new ActiveStandbyElector object <br/> 179 * The elector is created by providing to it the Zookeeper configuration, the 180 * parent znode under which to create the znode and a reference to the 181 * callback interface. <br/> 182 * The parent znode name must be the same for all service instances and 183 * different across services. <br/> 184 * After the leader has been lost, a new leader will be elected after the 185 * session timeout expires. Hence, the app must set this parameter based on 186 * its needs for failure response time. The session timeout must be greater 187 * than the Zookeeper disconnect timeout and is recommended to be 3X that 188 * value to enable Zookeeper to retry transient disconnections. Setting a very 189 * short session timeout may result in frequent transitions between active and 190 * standby states during issues like network outages/GS pauses. 191 * 192 * @param zookeeperHostPorts 193 * ZooKeeper hostPort for all ZooKeeper servers 194 * @param zookeeperSessionTimeout 195 * ZooKeeper session timeout 196 * @param parentZnodeName 197 * znode under which to create the lock 198 * @param acl 199 * ZooKeeper ACL's 200 * @param authInfo a list of authentication credentials to add to the 201 * ZK connection 202 * @param app 203 * reference to callback interface object 204 * @throws IOException 205 * @throws HadoopIllegalArgumentException 206 */ 207 public ActiveStandbyElector(String zookeeperHostPorts, 208 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl, 209 List<ZKAuthInfo> authInfo, 210 ActiveStandbyElectorCallback app) throws IOException, 211 HadoopIllegalArgumentException, KeeperException { 212 if (app == null || acl == null || parentZnodeName == null 213 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { 214 throw new HadoopIllegalArgumentException("Invalid argument"); 215 } 216 zkHostPort = zookeeperHostPorts; 217 zkSessionTimeout = zookeeperSessionTimeout; 218 zkAcl = acl; 219 zkAuthInfo = authInfo; 220 appClient = app; 221 znodeWorkingDir = parentZnodeName; 222 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME; 223 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; 224 225 // createConnection for future API calls 226 createConnection(); 227 } 228 229 /** 230 * To participate in election, the app will call joinElection. The result will 231 * be notified by a callback on either the becomeActive or becomeStandby app 232 * interfaces. <br/> 233 * After this the elector will automatically monitor the leader status and 234 * perform re-election if necessary<br/> 235 * The app could potentially start off in standby mode and ignore the 236 * becomeStandby call. 237 * 238 * @param data 239 * to be set by the app. non-null data must be set. 240 * @throws HadoopIllegalArgumentException 241 * if valid data is not supplied 242 */ 243 public synchronized void joinElection(byte[] data) 244 throws HadoopIllegalArgumentException { 245 246 if (data == null) { 247 throw new HadoopIllegalArgumentException("data cannot be null"); 248 } 249 250 if (wantToBeInElection) { 251 LOG.info("Already in election. Not re-connecting."); 252 return; 253 } 254 255 appData = new byte[data.length]; 256 System.arraycopy(data, 0, appData, 0, data.length); 257 258 LOG.debug("Attempting active election for " + this); 259 joinElectionInternal(); 260 } 261 262 /** 263 * @return true if the configured parent znode exists 264 */ 265 public synchronized boolean parentZNodeExists() 266 throws IOException, InterruptedException { 267 Preconditions.checkState(zkClient != null); 268 try { 269 return zkClient.exists(znodeWorkingDir, false) != null; 270 } catch (KeeperException e) { 271 throw new IOException("Couldn't determine existence of znode '" + 272 znodeWorkingDir + "'", e); 273 } 274 } 275 276 /** 277 * Utility function to ensure that the configured base znode exists. 278 * This recursively creates the znode as well as all of its parents. 279 */ 280 public synchronized void ensureParentZNode() 281 throws IOException, InterruptedException { 282 Preconditions.checkState(!wantToBeInElection, 283 "ensureParentZNode() may not be called while in the election"); 284 285 String pathParts[] = znodeWorkingDir.split("/"); 286 Preconditions.checkArgument(pathParts.length >= 1 && 287 pathParts[0].isEmpty(), 288 "Invalid path: %s", znodeWorkingDir); 289 290 StringBuilder sb = new StringBuilder(); 291 for (int i = 1; i < pathParts.length; i++) { 292 sb.append("/").append(pathParts[i]); 293 String prefixPath = sb.toString(); 294 LOG.debug("Ensuring existence of " + prefixPath); 295 try { 296 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT); 297 } catch (KeeperException e) { 298 if (isNodeExists(e.code())) { 299 // This is OK - just ensuring existence. 300 continue; 301 } else { 302 throw new IOException("Couldn't create " + prefixPath, e); 303 } 304 } 305 } 306 307 LOG.info("Successfully created " + znodeWorkingDir + " in ZK."); 308 } 309 310 /** 311 * Clear all of the state held within the parent ZNode. 312 * This recursively deletes everything within the znode as well as the 313 * parent znode itself. It should only be used when it's certain that 314 * no electors are currently participating in the election. 315 */ 316 public synchronized void clearParentZNode() 317 throws IOException, InterruptedException { 318 Preconditions.checkState(!wantToBeInElection, 319 "clearParentZNode() may not be called while in the election"); 320 321 try { 322 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK..."); 323 324 zkDoWithRetries(new ZKAction<Void>() { 325 @Override 326 public Void run() throws KeeperException, InterruptedException { 327 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir); 328 return null; 329 } 330 }); 331 } catch (KeeperException e) { 332 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir, 333 e); 334 } 335 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK."); 336 } 337 338 339 /** 340 * Any service instance can drop out of the election by calling quitElection. 341 * <br/> 342 * This will lose any leader status, if held, and stop monitoring of the lock 343 * node. <br/> 344 * If the instance wants to participate in election again, then it needs to 345 * call joinElection(). <br/> 346 * This allows service instances to take themselves out of rotation for known 347 * impending unavailable states (e.g. long GC pause or software upgrade). 348 * 349 * @param needFence true if the underlying daemon may need to be fenced 350 * if a failover occurs due to dropping out of the election. 351 */ 352 public synchronized void quitElection(boolean needFence) { 353 LOG.info("Yielding from election"); 354 if (!needFence && state == State.ACTIVE) { 355 // If active is gracefully going back to standby mode, remove 356 // our permanent znode so no one fences us. 357 tryDeleteOwnBreadCrumbNode(); 358 } 359 reset(); 360 wantToBeInElection = false; 361 } 362 363 /** 364 * Exception thrown when there is no active leader 365 */ 366 public static class ActiveNotFoundException extends Exception { 367 private static final long serialVersionUID = 3505396722342846462L; 368 } 369 370 /** 371 * get data set by the active leader 372 * 373 * @return data set by the active instance 374 * @throws ActiveNotFoundException 375 * when there is no active leader 376 * @throws KeeperException 377 * other zookeeper operation errors 378 * @throws InterruptedException 379 * @throws IOException 380 * when ZooKeeper connection could not be established 381 */ 382 public synchronized byte[] getActiveData() throws ActiveNotFoundException, 383 KeeperException, InterruptedException, IOException { 384 try { 385 if (zkClient == null) { 386 createConnection(); 387 } 388 Stat stat = new Stat(); 389 return getDataWithRetries(zkLockFilePath, false, stat); 390 } catch(KeeperException e) { 391 Code code = e.code(); 392 if (isNodeDoesNotExist(code)) { 393 // handle the commonly expected cases that make sense for us 394 throw new ActiveNotFoundException(); 395 } else { 396 throw e; 397 } 398 } 399 } 400 401 /** 402 * interface implementation of Zookeeper callback for create 403 */ 404 @Override 405 public synchronized void processResult(int rc, String path, Object ctx, 406 String name) { 407 if (isStaleClient(ctx)) return; 408 LOG.debug("CreateNode result: " + rc + " for path: " + path 409 + " connectionState: " + zkConnectionState + 410 " for " + this); 411 412 Code code = Code.get(rc); 413 if (isSuccess(code)) { 414 // we successfully created the znode. we are the leader. start monitoring 415 if (becomeActive()) { 416 monitorActiveStatus(); 417 } else { 418 reJoinElectionAfterFailureToBecomeActive(); 419 } 420 return; 421 } 422 423 if (isNodeExists(code)) { 424 if (createRetryCount == 0) { 425 // znode exists and we did not retry the operation. so a different 426 // instance has created it. become standby and monitor lock. 427 becomeStandby(); 428 } 429 // if we had retried then the znode could have been created by our first 430 // attempt to the server (that we lost) and this node exists response is 431 // for the second attempt. verify this case via ephemeral node owner. this 432 // will happen on the callback for monitoring the lock. 433 monitorActiveStatus(); 434 return; 435 } 436 437 String errorMessage = "Received create error from Zookeeper. code:" 438 + code.toString() + " for path " + path; 439 LOG.debug(errorMessage); 440 441 if (shouldRetry(code)) { 442 if (createRetryCount < NUM_RETRIES) { 443 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); 444 ++createRetryCount; 445 createLockNodeAsync(); 446 return; 447 } 448 errorMessage = errorMessage 449 + ". Not retrying further znode create connection errors."; 450 } else if (isSessionExpired(code)) { 451 // This isn't fatal - the client Watcher will re-join the election 452 LOG.warn("Lock acquisition failed because session was lost"); 453 return; 454 } 455 456 fatalError(errorMessage); 457 } 458 459 /** 460 * interface implementation of Zookeeper callback for monitor (exists) 461 */ 462 @Override 463 public synchronized void processResult(int rc, String path, Object ctx, 464 Stat stat) { 465 if (isStaleClient(ctx)) return; 466 467 assert wantToBeInElection : 468 "Got a StatNode result after quitting election"; 469 470 LOG.debug("StatNode result: " + rc + " for path: " + path 471 + " connectionState: " + zkConnectionState + " for " + this); 472 473 474 Code code = Code.get(rc); 475 if (isSuccess(code)) { 476 // the following owner check completes verification in case the lock znode 477 // creation was retried 478 if (stat.getEphemeralOwner() == zkClient.getSessionId()) { 479 // we own the lock znode. so we are the leader 480 if (!becomeActive()) { 481 reJoinElectionAfterFailureToBecomeActive(); 482 } 483 } else { 484 // we dont own the lock znode. so we are a standby. 485 becomeStandby(); 486 } 487 // the watch set by us will notify about changes 488 return; 489 } 490 491 if (isNodeDoesNotExist(code)) { 492 // the lock znode disappeared before we started monitoring it 493 enterNeutralMode(); 494 joinElectionInternal(); 495 return; 496 } 497 498 String errorMessage = "Received stat error from Zookeeper. code:" 499 + code.toString(); 500 LOG.debug(errorMessage); 501 502 if (shouldRetry(code)) { 503 if (statRetryCount < NUM_RETRIES) { 504 ++statRetryCount; 505 monitorLockNodeAsync(); 506 return; 507 } 508 errorMessage = errorMessage 509 + ". Not retrying further znode monitoring connection errors."; 510 } else if (isSessionExpired(code)) { 511 // This isn't fatal - the client Watcher will re-join the election 512 LOG.warn("Lock monitoring failed because session was lost"); 513 return; 514 } 515 516 fatalError(errorMessage); 517 } 518 519 /** 520 * We failed to become active. Re-join the election, but 521 * sleep for a few seconds after terminating our existing 522 * session, so that other nodes have a chance to become active. 523 * The failure to become active is already logged inside 524 * becomeActive(). 525 */ 526 private void reJoinElectionAfterFailureToBecomeActive() { 527 reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE); 528 } 529 530 /** 531 * interface implementation of Zookeeper watch events (connection and node), 532 * proxied by {@link WatcherWithClientRef}. 533 */ 534 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) { 535 Event.EventType eventType = event.getType(); 536 if (isStaleClient(zk)) return; 537 LOG.debug("Watcher event type: " + eventType + " with state:" 538 + event.getState() + " for path:" + event.getPath() 539 + " connectionState: " + zkConnectionState 540 + " for " + this); 541 542 if (eventType == Event.EventType.None) { 543 // the connection state has changed 544 switch (event.getState()) { 545 case SyncConnected: 546 LOG.info("Session connected."); 547 // if the listener was asked to move to safe state then it needs to 548 // be undone 549 ConnectionState prevConnectionState = zkConnectionState; 550 zkConnectionState = ConnectionState.CONNECTED; 551 if (prevConnectionState == ConnectionState.DISCONNECTED && 552 wantToBeInElection) { 553 monitorActiveStatus(); 554 } 555 break; 556 case Disconnected: 557 LOG.info("Session disconnected. Entering neutral mode..."); 558 559 // ask the app to move to safe state because zookeeper connection 560 // is not active and we dont know our state 561 zkConnectionState = ConnectionState.DISCONNECTED; 562 enterNeutralMode(); 563 break; 564 case Expired: 565 // the connection got terminated because of session timeout 566 // call listener to reconnect 567 LOG.info("Session expired. Entering neutral mode and rejoining..."); 568 enterNeutralMode(); 569 reJoinElection(0); 570 break; 571 default: 572 fatalError("Unexpected Zookeeper watch event state: " 573 + event.getState()); 574 break; 575 } 576 577 return; 578 } 579 580 // a watch on lock path in zookeeper has fired. so something has changed on 581 // the lock. ideally we should check that the path is the same as the lock 582 // path but trusting zookeeper for now 583 String path = event.getPath(); 584 if (path != null) { 585 switch (eventType) { 586 case NodeDeleted: 587 if (state == State.ACTIVE) { 588 enterNeutralMode(); 589 } 590 joinElectionInternal(); 591 break; 592 case NodeDataChanged: 593 monitorActiveStatus(); 594 break; 595 default: 596 LOG.debug("Unexpected node event: " + eventType + " for path: " + path); 597 monitorActiveStatus(); 598 } 599 600 return; 601 } 602 603 // some unexpected error has occurred 604 fatalError("Unexpected watch error from Zookeeper"); 605 } 606 607 /** 608 * Get a new zookeeper client instance. protected so that test class can 609 * inherit and pass in a mock object for zookeeper 610 * 611 * @return new zookeeper client instance 612 * @throws IOException 613 * @throws KeeperException zookeeper connectionloss exception 614 */ 615 protected synchronized ZooKeeper getNewZooKeeper() throws IOException, 616 KeeperException { 617 618 // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and 619 // may trigger the Connected event immediately. So, if we register the 620 // watcher after constructing ZooKeeper, we may miss that event. Instead, 621 // we construct the watcher first, and have it block any events it receives 622 // before we can set its ZooKeeper reference. 623 watcher = new WatcherWithClientRef(); 624 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); 625 watcher.setZooKeeperRef(zk); 626 627 // Wait for the asynchronous success/failure. This may throw an exception 628 // if we don't connect within the session timeout. 629 watcher.waitForZKConnectionEvent(zkSessionTimeout); 630 631 for (ZKAuthInfo auth : zkAuthInfo) { 632 zk.addAuthInfo(auth.getScheme(), auth.getAuth()); 633 } 634 return zk; 635 } 636 637 private void fatalError(String errorMessage) { 638 LOG.fatal(errorMessage); 639 reset(); 640 appClient.notifyFatalError(errorMessage); 641 } 642 643 private void monitorActiveStatus() { 644 assert wantToBeInElection; 645 LOG.debug("Monitoring active leader for " + this); 646 statRetryCount = 0; 647 monitorLockNodeAsync(); 648 } 649 650 private void joinElectionInternal() { 651 Preconditions.checkState(appData != null, 652 "trying to join election without any app data"); 653 if (zkClient == null) { 654 if (!reEstablishSession()) { 655 fatalError("Failed to reEstablish connection with ZooKeeper"); 656 return; 657 } 658 } 659 660 createRetryCount = 0; 661 wantToBeInElection = true; 662 createLockNodeAsync(); 663 } 664 665 private void reJoinElection(int sleepTime) { 666 LOG.info("Trying to re-establish ZK session"); 667 668 // Some of the test cases rely on expiring the ZK sessions and 669 // ensuring that the other node takes over. But, there's a race 670 // where the original lease holder could reconnect faster than the other 671 // thread manages to take the lock itself. This lock allows the 672 // tests to block the reconnection. It's a shame that this leaked 673 // into non-test code, but the lock is only acquired here so will never 674 // be contended. 675 sessionReestablishLockForTests.lock(); 676 try { 677 terminateConnection(); 678 sleepFor(sleepTime); 679 // Should not join election even before the SERVICE is reported 680 // as HEALTHY from ZKFC monitoring. 681 if (appData != null) { 682 joinElectionInternal(); 683 } else { 684 LOG.info("Not joining election since service has not yet been " + 685 "reported as healthy."); 686 } 687 } finally { 688 sessionReestablishLockForTests.unlock(); 689 } 690 } 691 692 /** 693 * Sleep for the given number of milliseconds. 694 * This is non-static, and separated out, so that unit tests 695 * can override the behavior not to sleep. 696 */ 697 @VisibleForTesting 698 protected void sleepFor(int sleepMs) { 699 if (sleepMs > 0) { 700 try { 701 Thread.sleep(sleepMs); 702 } catch (InterruptedException e) { 703 Thread.currentThread().interrupt(); 704 } 705 } 706 } 707 708 @VisibleForTesting 709 void preventSessionReestablishmentForTests() { 710 sessionReestablishLockForTests.lock(); 711 } 712 713 @VisibleForTesting 714 void allowSessionReestablishmentForTests() { 715 sessionReestablishLockForTests.unlock(); 716 } 717 718 @VisibleForTesting 719 synchronized long getZKSessionIdForTests() { 720 if (zkClient != null) { 721 return zkClient.getSessionId(); 722 } else { 723 return -1; 724 } 725 } 726 727 @VisibleForTesting 728 synchronized State getStateForTests() { 729 return state; 730 } 731 732 private boolean reEstablishSession() { 733 int connectionRetryCount = 0; 734 boolean success = false; 735 while(!success && connectionRetryCount < NUM_RETRIES) { 736 LOG.debug("Establishing zookeeper connection for " + this); 737 try { 738 createConnection(); 739 success = true; 740 } catch(IOException e) { 741 LOG.warn(e); 742 sleepFor(5000); 743 } catch(KeeperException e) { 744 LOG.warn(e); 745 sleepFor(5000); 746 } 747 ++connectionRetryCount; 748 } 749 return success; 750 } 751 752 private void createConnection() throws IOException, KeeperException { 753 if (zkClient != null) { 754 try { 755 zkClient.close(); 756 } catch (InterruptedException e) { 757 throw new IOException("Interrupted while closing ZK", 758 e); 759 } 760 zkClient = null; 761 watcher = null; 762 } 763 zkClient = getNewZooKeeper(); 764 LOG.debug("Created new connection for " + this); 765 } 766 767 void terminateConnection() { 768 if (zkClient == null) { 769 return; 770 } 771 LOG.debug("Terminating ZK connection for " + this); 772 ZooKeeper tempZk = zkClient; 773 zkClient = null; 774 watcher = null; 775 try { 776 tempZk.close(); 777 } catch(InterruptedException e) { 778 LOG.warn(e); 779 } 780 zkConnectionState = ConnectionState.TERMINATED; 781 wantToBeInElection = false; 782 } 783 784 private void reset() { 785 state = State.INIT; 786 terminateConnection(); 787 } 788 789 private boolean becomeActive() { 790 assert wantToBeInElection; 791 if (state == State.ACTIVE) { 792 // already active 793 return true; 794 } 795 try { 796 Stat oldBreadcrumbStat = fenceOldActive(); 797 writeBreadCrumbNode(oldBreadcrumbStat); 798 799 LOG.debug("Becoming active for " + this); 800 appClient.becomeActive(); 801 state = State.ACTIVE; 802 return true; 803 } catch (Exception e) { 804 LOG.warn("Exception handling the winning of election", e); 805 // Caller will handle quitting and rejoining the election. 806 return false; 807 } 808 } 809 810 /** 811 * Write the "ActiveBreadCrumb" node, indicating that this node may need 812 * to be fenced on failover. 813 * @param oldBreadcrumbStat 814 */ 815 private void writeBreadCrumbNode(Stat oldBreadcrumbStat) 816 throws KeeperException, InterruptedException { 817 Preconditions.checkState(appData != null, "no appdata"); 818 819 LOG.info("Writing znode " + zkBreadCrumbPath + 820 " to indicate that the local node is the most recent active..."); 821 if (oldBreadcrumbStat == null) { 822 // No previous active, just create the node 823 createWithRetries(zkBreadCrumbPath, appData, zkAcl, 824 CreateMode.PERSISTENT); 825 } else { 826 // There was a previous active, update the node 827 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion()); 828 } 829 } 830 831 /** 832 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up 833 * active status. 834 * If this fails, it will simply warn, since the graceful release behavior 835 * is only an optimization. 836 */ 837 private void tryDeleteOwnBreadCrumbNode() { 838 assert state == State.ACTIVE; 839 LOG.info("Deleting bread-crumb of active node..."); 840 841 // Sanity check the data. This shouldn't be strictly necessary, 842 // but better to play it safe. 843 Stat stat = new Stat(); 844 byte[] data = null; 845 try { 846 data = zkClient.getData(zkBreadCrumbPath, false, stat); 847 848 if (!Arrays.equals(data, appData)) { 849 throw new IllegalStateException( 850 "We thought we were active, but in fact " + 851 "the active znode had the wrong data: " + 852 StringUtils.byteToHexString(data) + " (stat=" + stat + ")"); 853 } 854 855 deleteWithRetries(zkBreadCrumbPath, stat.getVersion()); 856 } catch (Exception e) { 857 LOG.warn("Unable to delete our own bread-crumb of being active at " + 858 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " + 859 "Expecting to be fenced by the next active."); 860 } 861 } 862 863 /** 864 * If there is a breadcrumb node indicating that another node may need 865 * fencing, try to fence that node. 866 * @return the Stat of the breadcrumb node that was read, or null 867 * if no breadcrumb node existed 868 */ 869 private Stat fenceOldActive() throws InterruptedException, KeeperException { 870 final Stat stat = new Stat(); 871 byte[] data; 872 LOG.info("Checking for any old active which needs to be fenced..."); 873 try { 874 data = zkDoWithRetries(new ZKAction<byte[]>() { 875 @Override 876 public byte[] run() throws KeeperException, InterruptedException { 877 return zkClient.getData(zkBreadCrumbPath, false, stat); 878 } 879 }); 880 } catch (KeeperException ke) { 881 if (isNodeDoesNotExist(ke.code())) { 882 LOG.info("No old node to fence"); 883 return null; 884 } 885 886 // If we failed to read for any other reason, then likely we lost 887 // our session, or we don't have permissions, etc. In any case, 888 // we probably shouldn't become active, and failing the whole 889 // thing is the best bet. 890 throw ke; 891 } 892 893 LOG.info("Old node exists: " + StringUtils.byteToHexString(data)); 894 if (Arrays.equals(data, appData)) { 895 LOG.info("But old node has our own data, so don't need to fence it."); 896 } else { 897 appClient.fenceOldActive(data); 898 } 899 return stat; 900 } 901 902 private void becomeStandby() { 903 if (state != State.STANDBY) { 904 LOG.debug("Becoming standby for " + this); 905 state = State.STANDBY; 906 appClient.becomeStandby(); 907 } 908 } 909 910 private void enterNeutralMode() { 911 if (state != State.NEUTRAL) { 912 LOG.debug("Entering neutral mode for " + this); 913 state = State.NEUTRAL; 914 appClient.enterNeutralMode(); 915 } 916 } 917 918 private void createLockNodeAsync() { 919 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, 920 this, zkClient); 921 } 922 923 private void monitorLockNodeAsync() { 924 zkClient.exists(zkLockFilePath, 925 watcher, this, 926 zkClient); 927 } 928 929 private String createWithRetries(final String path, final byte[] data, 930 final List<ACL> acl, final CreateMode mode) 931 throws InterruptedException, KeeperException { 932 return zkDoWithRetries(new ZKAction<String>() { 933 @Override 934 public String run() throws KeeperException, InterruptedException { 935 return zkClient.create(path, data, acl, mode); 936 } 937 }); 938 } 939 940 private byte[] getDataWithRetries(final String path, final boolean watch, 941 final Stat stat) throws InterruptedException, KeeperException { 942 return zkDoWithRetries(new ZKAction<byte[]>() { 943 @Override 944 public byte[] run() throws KeeperException, InterruptedException { 945 return zkClient.getData(path, watch, stat); 946 } 947 }); 948 } 949 950 private Stat setDataWithRetries(final String path, final byte[] data, 951 final int version) throws InterruptedException, KeeperException { 952 return zkDoWithRetries(new ZKAction<Stat>() { 953 @Override 954 public Stat run() throws KeeperException, InterruptedException { 955 return zkClient.setData(path, data, version); 956 } 957 }); 958 } 959 960 private void deleteWithRetries(final String path, final int version) 961 throws KeeperException, InterruptedException { 962 zkDoWithRetries(new ZKAction<Void>() { 963 @Override 964 public Void run() throws KeeperException, InterruptedException { 965 zkClient.delete(path, version); 966 return null; 967 } 968 }); 969 } 970 971 private static <T> T zkDoWithRetries(ZKAction<T> action) 972 throws KeeperException, InterruptedException { 973 int retry = 0; 974 while (true) { 975 try { 976 return action.run(); 977 } catch (KeeperException ke) { 978 if (shouldRetry(ke.code()) && ++retry < NUM_RETRIES) { 979 continue; 980 } 981 throw ke; 982 } 983 } 984 } 985 986 private interface ZKAction<T> { 987 T run() throws KeeperException, InterruptedException; 988 } 989 990 /** 991 * The callbacks and watchers pass a reference to the ZK client 992 * which made the original call. We don't want to take action 993 * based on any callbacks from prior clients after we quit 994 * the election. 995 * @param ctx the ZK client passed into the watcher 996 * @return true if it matches the current client 997 */ 998 private synchronized boolean isStaleClient(Object ctx) { 999 Preconditions.checkNotNull(ctx); 1000 if (zkClient != (ZooKeeper)ctx) { 1001 LOG.warn("Ignoring stale result from old client with sessionId " + 1002 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId())); 1003 return true; 1004 } 1005 return false; 1006 } 1007 1008 /** 1009 * Watcher implementation which keeps a reference around to the 1010 * original ZK connection, and passes it back along with any 1011 * events. 1012 */ 1013 private final class WatcherWithClientRef implements Watcher { 1014 private ZooKeeper zk; 1015 1016 /** 1017 * Latch fired whenever any event arrives. This is used in order 1018 * to wait for the Connected event when the client is first created. 1019 */ 1020 private CountDownLatch hasReceivedEvent = new CountDownLatch(1); 1021 1022 /** 1023 * Latch used to wait until the reference to ZooKeeper is set. 1024 */ 1025 private CountDownLatch hasSetZooKeeper = new CountDownLatch(1); 1026 1027 /** 1028 * Waits for the next event from ZooKeeper to arrive. 1029 * 1030 * @param connectionTimeoutMs zookeeper connection timeout in milliseconds 1031 * @throws KeeperException if the connection attempt times out. This will 1032 * be a ZooKeeper ConnectionLoss exception code. 1033 * @throws IOException if interrupted while connecting to ZooKeeper 1034 */ 1035 private void waitForZKConnectionEvent(int connectionTimeoutMs) 1036 throws KeeperException, IOException { 1037 try { 1038 if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) { 1039 LOG.error("Connection timed out: couldn't connect to ZooKeeper in " 1040 + connectionTimeoutMs + " milliseconds"); 1041 zk.close(); 1042 throw KeeperException.create(Code.CONNECTIONLOSS); 1043 } 1044 } catch (InterruptedException e) { 1045 Thread.currentThread().interrupt(); 1046 throw new IOException( 1047 "Interrupted when connecting to zookeeper server", e); 1048 } 1049 } 1050 1051 private void setZooKeeperRef(ZooKeeper zk) { 1052 Preconditions.checkState(this.zk == null, 1053 "zk already set -- must be set exactly once"); 1054 this.zk = zk; 1055 hasSetZooKeeper.countDown(); 1056 } 1057 1058 @Override 1059 public void process(WatchedEvent event) { 1060 hasReceivedEvent.countDown(); 1061 try { 1062 hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS); 1063 ActiveStandbyElector.this.processWatchEvent( 1064 zk, event); 1065 } catch (Throwable t) { 1066 fatalError( 1067 "Failed to process watcher event " + event + ": " + 1068 StringUtils.stringifyException(t)); 1069 } 1070 } 1071 } 1072 1073 private static boolean isSuccess(Code code) { 1074 return (code == Code.OK); 1075 } 1076 1077 private static boolean isNodeExists(Code code) { 1078 return (code == Code.NODEEXISTS); 1079 } 1080 1081 private static boolean isNodeDoesNotExist(Code code) { 1082 return (code == Code.NONODE); 1083 } 1084 1085 private static boolean isSessionExpired(Code code) { 1086 return (code == Code.SESSIONEXPIRED); 1087 } 1088 1089 private static boolean shouldRetry(Code code) { 1090 switch (code) { 1091 case CONNECTIONLOSS: 1092 case OPERATIONTIMEOUT: 1093 return true; 1094 } 1095 return false; 1096 } 1097 1098 @Override 1099 public String toString() { 1100 return "elector id=" + System.identityHashCode(this) + 1101 " appData=" + 1102 ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 1103 " cb=" + appClient; 1104 } 1105 }