001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.ha; 019 020import java.io.IOException; 021import java.io.PrintStream; 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.Collection; 025import java.util.Map; 026 027import org.apache.commons.cli.Options; 028import org.apache.commons.cli.CommandLine; 029import org.apache.commons.cli.GnuParser; 030import org.apache.commons.cli.ParseException; 031import org.apache.commons.logging.Log; 032import org.apache.commons.logging.LogFactory; 033 034import org.apache.hadoop.classification.InterfaceAudience; 035import org.apache.hadoop.conf.Configuration; 036import org.apache.hadoop.conf.Configured; 037import org.apache.hadoop.fs.CommonConfigurationKeys; 038import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; 039import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo; 040import org.apache.hadoop.ha.HAServiceProtocol.RequestSource; 041import org.apache.hadoop.util.Tool; 042import org.apache.hadoop.util.ToolRunner; 043 044import com.google.common.base.Preconditions; 045import com.google.common.collect.ImmutableMap; 046 047/** 048 * A command-line tool for making calls in the HAServiceProtocol. 049 * For example,. this can be used to force a service to standby or active 050 * mode, or to trigger a health-check. 051 */ 052@InterfaceAudience.Private 053 054public abstract class HAAdmin extends Configured implements Tool { 055 056 private static final String FORCEFENCE = "forcefence"; 057 private static final String FORCEACTIVE = "forceactive"; 058 059 /** 060 * Undocumented flag which allows an administrator to use manual failover 061 * state transitions even when auto-failover is enabled. This is an unsafe 062 * operation, which is why it is not documented in the usage below. 063 */ 064 private static final String FORCEMANUAL = "forcemanual"; 065 private static final Log LOG = LogFactory.getLog(HAAdmin.class); 066 067 private int rpcTimeoutForChecks = -1; 068 069 protected final static Map<String, UsageInfo> USAGE = 070 ImmutableMap.<String, UsageInfo>builder() 071 .put("-transitionToActive", 072 new UsageInfo("<serviceId> [--"+FORCEACTIVE+"]", "Transitions the service into Active state")) 073 .put("-transitionToStandby", 074 new UsageInfo("<serviceId>", "Transitions the service into Standby state")) 075 .put("-failover", 076 new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>", 077 "Failover from the first service to the second.\n" + 078 "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" + 079 "Try to failover to the target service even if it is not ready if the " + 080 FORCEACTIVE + " option is used.")) 081 .put("-getServiceState", 082 new UsageInfo("<serviceId>", "Returns the state of the service")) 083 .put("-checkHealth", 084 new UsageInfo("<serviceId>", 085 "Requests that the service perform a health check.\n" + 086 "The HAAdmin tool will exit with a non-zero exit code\n" + 087 "if the check fails.")) 088 .put("-help", 089 new UsageInfo("<command>", "Displays help on the specified command")) 090 .build(); 091 092 /** Output stream for errors, for use in tests */ 093 protected PrintStream errOut = System.err; 094 protected PrintStream out = System.out; 095 private RequestSource requestSource = RequestSource.REQUEST_BY_USER; 096 097 protected HAAdmin() { 098 super(); 099 } 100 101 protected HAAdmin(Configuration conf) { 102 super(conf); 103 } 104 105 protected abstract HAServiceTarget resolveTarget(String string); 106 107 protected Collection<String> getTargetIds(String targetNodeToActivate) { 108 return new ArrayList<String>( 109 Arrays.asList(new String[]{targetNodeToActivate})); 110 } 111 112 protected String getUsageString() { 113 return "Usage: HAAdmin"; 114 } 115 116 protected void printUsage(PrintStream errOut) { 117 errOut.println(getUsageString()); 118 for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) { 119 String cmd = e.getKey(); 120 UsageInfo usage = e.getValue(); 121 122 errOut.println(" [" + cmd + " " + usage.args + "]"); 123 } 124 errOut.println(); 125 ToolRunner.printGenericCommandUsage(errOut); 126 } 127 128 private static void printUsage(PrintStream errOut, String cmd) { 129 UsageInfo usage = USAGE.get(cmd); 130 if (usage == null) { 131 throw new RuntimeException("No usage for cmd " + cmd); 132 } 133 errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]"); 134 } 135 136 private int transitionToActive(final CommandLine cmd) 137 throws IOException, ServiceFailedException { 138 String[] argv = cmd.getArgs(); 139 if (argv.length != 1) { 140 errOut.println("transitionToActive: incorrect number of arguments"); 141 printUsage(errOut, "-transitionToActive"); 142 return -1; 143 } 144 /* returns true if other target node is active or some exception occurred 145 and forceActive was not set */ 146 if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) { 147 return -1; 148 } 149 HAServiceTarget target = resolveTarget(argv[0]); 150 if (!checkManualStateManagementOK(target)) { 151 return -1; 152 } 153 HAServiceProtocol proto = target.getProxy( 154 getConf(), 0); 155 HAServiceProtocolHelper.transitionToActive(proto, createReqInfo()); 156 return 0; 157 } 158 159 /** 160 * Checks whether other target node is active or not 161 * @param targetNodeToActivate 162 * @return true if other target node is active or some other exception 163 * occurred and forceActive was set otherwise false 164 * @throws IOException 165 */ 166 private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive) 167 throws IOException { 168 Collection<String> targetIds = getTargetIds(targetNodeToActivate); 169 if(targetIds == null) { 170 errOut.println("transitionToActive: No target node in the " 171 + "current configuration"); 172 printUsage(errOut, "-transitionToActive"); 173 return true; 174 } 175 targetIds.remove(targetNodeToActivate); 176 for(String targetId : targetIds) { 177 HAServiceTarget target = resolveTarget(targetId); 178 if (!checkManualStateManagementOK(target)) { 179 return true; 180 } 181 try { 182 HAServiceProtocol proto = target.getProxy(getConf(), 5000); 183 if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) { 184 errOut.println("transitionToActive: Node " + targetId +" is already active"); 185 printUsage(errOut, "-transitionToActive"); 186 return true; 187 } 188 } catch (Exception e) { 189 //If forceActive switch is false then return true 190 if(!forceActive) { 191 errOut.println("Unexpected error occurred " + e.getMessage()); 192 printUsage(errOut, "-transitionToActive"); 193 return true; 194 } 195 } 196 } 197 return false; 198 } 199 200 private int transitionToStandby(final CommandLine cmd) 201 throws IOException, ServiceFailedException { 202 String[] argv = cmd.getArgs(); 203 if (argv.length != 1) { 204 errOut.println("transitionToStandby: incorrect number of arguments"); 205 printUsage(errOut, "-transitionToStandby"); 206 return -1; 207 } 208 209 HAServiceTarget target = resolveTarget(argv[0]); 210 if (!checkManualStateManagementOK(target)) { 211 return -1; 212 } 213 HAServiceProtocol proto = target.getProxy( 214 getConf(), 0); 215 HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo()); 216 return 0; 217 } 218 /** 219 * Ensure that we are allowed to manually manage the HA state of the target 220 * service. If automatic failover is configured, then the automatic 221 * failover controllers should be doing state management, and it is generally 222 * an error to use the HAAdmin command line to do so. 223 * 224 * @param target the target to check 225 * @return true if manual state management is allowed 226 */ 227 private boolean checkManualStateManagementOK(HAServiceTarget target) { 228 if (target.isAutoFailoverEnabled()) { 229 if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) { 230 errOut.println( 231 "Automatic failover is enabled for " + target + "\n" + 232 "Refusing to manually manage HA state, since it may cause\n" + 233 "a split-brain scenario or other incorrect state.\n" + 234 "If you are very sure you know what you are doing, please \n" + 235 "specify the " + FORCEMANUAL + " flag."); 236 return false; 237 } else { 238 LOG.warn("Proceeding with manual HA state management even though\n" + 239 "automatic failover is enabled for " + target); 240 return true; 241 } 242 } 243 return true; 244 } 245 246 private StateChangeRequestInfo createReqInfo() { 247 return new StateChangeRequestInfo(requestSource); 248 } 249 250 private int failover(CommandLine cmd) 251 throws IOException, ServiceFailedException { 252 boolean forceFence = cmd.hasOption(FORCEFENCE); 253 boolean forceActive = cmd.hasOption(FORCEACTIVE); 254 255 int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length; 256 final String[] args = cmd.getArgs(); 257 258 if (numOpts > 3 || args.length != 2) { 259 errOut.println("failover: incorrect arguments"); 260 printUsage(errOut, "-failover"); 261 return -1; 262 } 263 264 HAServiceTarget fromNode = resolveTarget(args[0]); 265 HAServiceTarget toNode = resolveTarget(args[1]); 266 267 // Check that auto-failover is consistently configured for both nodes. 268 Preconditions.checkState( 269 fromNode.isAutoFailoverEnabled() == 270 toNode.isAutoFailoverEnabled(), 271 "Inconsistent auto-failover configs between %s and %s!", 272 fromNode, toNode); 273 274 if (fromNode.isAutoFailoverEnabled()) { 275 if (forceFence || forceActive) { 276 // -forceActive doesn't make sense with auto-HA, since, if the node 277 // is not healthy, then its ZKFC will immediately quit the election 278 // again the next time a health check runs. 279 // 280 // -forceFence doesn't seem to have any real use cases with auto-HA 281 // so it isn't implemented. 282 errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " + 283 "supported with auto-failover enabled."); 284 return -1; 285 } 286 return gracefulFailoverThroughZKFCs(toNode); 287 } 288 289 FailoverController fc = new FailoverController(getConf(), 290 requestSource); 291 292 try { 293 fc.failover(fromNode, toNode, forceFence, forceActive); 294 out.println("Failover from "+args[0]+" to "+args[1]+" successful"); 295 } catch (FailoverFailedException ffe) { 296 errOut.println("Failover failed: " + ffe.getLocalizedMessage()); 297 return -1; 298 } 299 return 0; 300 } 301 302 303 /** 304 * Initiate a graceful failover by talking to the target node's ZKFC. 305 * This sends an RPC to the ZKFC, which coordinates the failover. 306 * 307 * @param toNode the node to fail to 308 * @return status code (0 for success) 309 * @throws IOException if failover does not succeed 310 */ 311 private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode) 312 throws IOException { 313 314 int timeout = FailoverController.getRpcTimeoutToNewActive(getConf()); 315 ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout); 316 try { 317 proxy.gracefulFailover(); 318 out.println("Failover to " + toNode + " successful"); 319 } catch (ServiceFailedException sfe) { 320 errOut.println("Failover failed: " + sfe.getLocalizedMessage()); 321 return -1; 322 } 323 324 return 0; 325 } 326 327 private int checkHealth(final CommandLine cmd) 328 throws IOException, ServiceFailedException { 329 String[] argv = cmd.getArgs(); 330 if (argv.length != 1) { 331 errOut.println("checkHealth: incorrect number of arguments"); 332 printUsage(errOut, "-checkHealth"); 333 return -1; 334 } 335 HAServiceProtocol proto = resolveTarget(argv[0]).getProxy( 336 getConf(), rpcTimeoutForChecks); 337 try { 338 HAServiceProtocolHelper.monitorHealth(proto, createReqInfo()); 339 } catch (HealthCheckFailedException e) { 340 errOut.println("Health check failed: " + e.getLocalizedMessage()); 341 return -1; 342 } 343 return 0; 344 } 345 346 private int getServiceState(final CommandLine cmd) 347 throws IOException, ServiceFailedException { 348 String[] argv = cmd.getArgs(); 349 if (argv.length != 1) { 350 errOut.println("getServiceState: incorrect number of arguments"); 351 printUsage(errOut, "-getServiceState"); 352 return -1; 353 } 354 355 HAServiceProtocol proto = resolveTarget(argv[0]).getProxy( 356 getConf(), rpcTimeoutForChecks); 357 out.println(proto.getServiceStatus().getState()); 358 return 0; 359 } 360 361 /** 362 * Return the serviceId as is, we are assuming it was 363 * given as a service address of form <host:ipcport>. 364 */ 365 protected String getServiceAddr(String serviceId) { 366 return serviceId; 367 } 368 369 @Override 370 public void setConf(Configuration conf) { 371 super.setConf(conf); 372 if (conf != null) { 373 rpcTimeoutForChecks = conf.getInt( 374 CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY, 375 CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT); 376 } 377 } 378 379 @Override 380 public int run(String[] argv) throws Exception { 381 try { 382 return runCmd(argv); 383 } catch (IllegalArgumentException iae) { 384 errOut.println("Illegal argument: " + iae.getLocalizedMessage()); 385 return -1; 386 } catch (IOException ioe) { 387 errOut.println("Operation failed: " + ioe.getLocalizedMessage()); 388 if (LOG.isDebugEnabled()) { 389 LOG.debug("Operation failed", ioe); 390 } 391 return -1; 392 } 393 } 394 395 protected int runCmd(String[] argv) throws Exception { 396 if (argv.length < 1) { 397 printUsage(errOut); 398 return -1; 399 } 400 401 String cmd = argv[0]; 402 403 if (!cmd.startsWith("-")) { 404 errOut.println("Bad command '" + cmd + "': expected command starting with '-'"); 405 printUsage(errOut); 406 return -1; 407 } 408 409 if (!USAGE.containsKey(cmd)) { 410 errOut.println(cmd.substring(1) + ": Unknown command"); 411 printUsage(errOut); 412 return -1; 413 } 414 415 Options opts = new Options(); 416 417 // Add command-specific options 418 if ("-failover".equals(cmd)) { 419 addFailoverCliOpts(opts); 420 } 421 if("-transitionToActive".equals(cmd)) { 422 addTransitionToActiveCliOpts(opts); 423 } 424 // Mutative commands take FORCEMANUAL option 425 if ("-transitionToActive".equals(cmd) || 426 "-transitionToStandby".equals(cmd) || 427 "-failover".equals(cmd)) { 428 opts.addOption(FORCEMANUAL, false, 429 "force manual control even if auto-failover is enabled"); 430 } 431 432 CommandLine cmdLine = parseOpts(cmd, opts, argv); 433 if (cmdLine == null) { 434 // error already printed 435 return -1; 436 } 437 438 if (cmdLine.hasOption(FORCEMANUAL)) { 439 if (!confirmForceManual()) { 440 LOG.fatal("Aborted"); 441 return -1; 442 } 443 // Instruct the NNs to honor this request even if they're 444 // configured for manual failover. 445 requestSource = RequestSource.REQUEST_BY_USER_FORCED; 446 } 447 448 if ("-transitionToActive".equals(cmd)) { 449 return transitionToActive(cmdLine); 450 } else if ("-transitionToStandby".equals(cmd)) { 451 return transitionToStandby(cmdLine); 452 } else if ("-failover".equals(cmd)) { 453 return failover(cmdLine); 454 } else if ("-getServiceState".equals(cmd)) { 455 return getServiceState(cmdLine); 456 } else if ("-checkHealth".equals(cmd)) { 457 return checkHealth(cmdLine); 458 } else if ("-help".equals(cmd)) { 459 return help(argv); 460 } else { 461 // we already checked command validity above, so getting here 462 // would be a coding error 463 throw new AssertionError("Should not get here, command: " + cmd); 464 } 465 } 466 467 private boolean confirmForceManual() throws IOException { 468 return ToolRunner.confirmPrompt( 469 "You have specified the " + FORCEMANUAL + " flag. This flag is " + 470 "dangerous, as it can induce a split-brain scenario that WILL " + 471 "CORRUPT your HDFS namespace, possibly irrecoverably.\n" + 472 "\n" + 473 "It is recommended not to use this flag, but instead to shut down the " + 474 "cluster and disable automatic failover if you prefer to manually " + 475 "manage your HA state.\n" + 476 "\n" + 477 "You may abort safely by answering 'n' or hitting ^C now.\n" + 478 "\n" + 479 "Are you sure you want to continue?"); 480 } 481 482 /** 483 * Add CLI options which are specific to the failover command and no 484 * others. 485 */ 486 private void addFailoverCliOpts(Options failoverOpts) { 487 failoverOpts.addOption(FORCEFENCE, false, "force fencing"); 488 failoverOpts.addOption(FORCEACTIVE, false, "force failover"); 489 // Don't add FORCEMANUAL, since that's added separately for all commands 490 // that change state. 491 } 492 493 /** 494 * Add CLI options which are specific to the transitionToActive command and 495 * no others. 496 */ 497 private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) { 498 transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active"); 499 } 500 501 private CommandLine parseOpts(String cmdName, Options opts, String[] argv) { 502 try { 503 // Strip off the first arg, since that's just the command name 504 argv = Arrays.copyOfRange(argv, 1, argv.length); 505 return new GnuParser().parse(opts, argv); 506 } catch (ParseException pe) { 507 errOut.println(cmdName.substring(1) + 508 ": incorrect arguments"); 509 printUsage(errOut, cmdName); 510 return null; 511 } 512 } 513 514 private int help(String[] argv) { 515 if (argv.length == 1) { // only -help 516 printUsage(out); 517 return 0; 518 } else if (argv.length != 2) { 519 printUsage(errOut, "-help"); 520 return -1; 521 } 522 String cmd = argv[1]; 523 if (!cmd.startsWith("-")) { 524 cmd = "-" + cmd; 525 } 526 UsageInfo usageInfo = USAGE.get(cmd); 527 if (usageInfo == null) { 528 errOut.println(cmd + ": Unknown command"); 529 printUsage(errOut); 530 return -1; 531 } 532 533 out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help); 534 return 0; 535 } 536 537 protected static class UsageInfo { 538 public final String args; 539 public final String help; 540 541 public UsageInfo(String args, String help) { 542 this.args = args; 543 this.help = help; 544 } 545 } 546}