/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.ha;

import java.io.IOException;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Map;

import org.apache.commons.cli.Options;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

/**
 * A command-line tool for making calls in the HAServiceProtocol.
 * For example, this can be used to force a service to standby or active
 * mode, or to trigger a health-check.
 *
 * <p>Subclasses supply the mapping from a command-line service identifier
 * to an {@link HAServiceTarget} via {@link #resolveTarget(String)} and may
 * customize the usage banner via {@link #getUsageString()}.
 */
@InterfaceAudience.Private
public abstract class HAAdmin extends Configured implements Tool {

  private static final String FORCEFENCE = "forcefence";
  private static final String FORCEACTIVE = "forceactive";

  /**
   * Undocumented flag which allows an administrator to use manual failover
   * state transitions even when auto-failover is enabled. This is an unsafe
   * operation, which is why it is not documented in the usage below.
   */
  private static final String FORCEMANUAL = "forcemanual";
  private static final Log LOG = LogFactory.getLog(HAAdmin.class);

  /**
   * RPC timeout passed to the proxies used by the read-only commands
   * (-getServiceState and -checkHealth). Stays at -1 until
   * {@link #setConf(Configuration)} is called with a non-null configuration.
   */
  private int rpcTimeoutForChecks = -1;

  /** Supported commands, keyed by command name including the leading '-'. */
  protected final static Map<String, UsageInfo> USAGE =
    ImmutableMap.<String, UsageInfo>builder()
    .put("-transitionToActive",
        new UsageInfo("<serviceId>", "Transitions the service into Active state"))
    .put("-transitionToStandby",
        new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
    .put("-failover",
        new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
            "Failover from the first service to the second.\n" +
            "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
            "Try to failover to the target service even if it is not ready if the " +
            FORCEACTIVE + " option is used."))
    .put("-getServiceState",
        new UsageInfo("<serviceId>", "Returns the state of the service"))
    .put("-checkHealth",
        new UsageInfo("<serviceId>",
            "Requests that the service perform a health check.\n" +
            "The HAAdmin tool will exit with a non-zero exit code\n" +
            "if the check fails."))
    .put("-help",
        new UsageInfo("<command>", "Displays help on the specified command"))
    .build();

  /** Output stream for errors, for use in tests */
  protected PrintStream errOut = System.err;
  /** Output stream for regular command output, for use in tests */
  protected PrintStream out = System.out;

  /**
   * Source attached to every state-change request issued by the current
   * command. Re-evaluated on each invocation of {@link #runCmd(String[])}.
   */
  private RequestSource requestSource = RequestSource.REQUEST_BY_USER;

  /** Creates an admin tool without a configuration. */
  protected HAAdmin() {
    super();
  }

  /**
   * Creates an admin tool bound to the given configuration.
   *
   * @param conf the configuration to use
   */
  protected HAAdmin(Configuration conf) {
    super(conf);
  }

  /**
   * Maps a service identifier given on the command line to the target it
   * designates.
   *
   * @param string the service identifier from the command line
   * @return the target to operate on
   */
  protected abstract HAServiceTarget resolveTarget(String string);

  /**
   * Returns the banner that prefixes usage output. Subclasses may override
   * it to advertise their own tool name.
   *
   * @return the usage banner
   */
  protected String getUsageString() {
    return "Usage: HAAdmin";
  }

  /**
   * Prints the usage banner, one line per supported command, and the generic
   * command usage from {@link ToolRunner}.
   *
   * @param errOut the stream to print to
   */
  protected void printUsage(PrintStream errOut) {
    errOut.println(getUsageString());
    for (Map.Entry<String, UsageInfo> entry : USAGE.entrySet()) {
      errOut.println(" [" + entry.getKey() + " " + entry.getValue().args + "]");
    }
    errOut.println();
    ToolRunner.printGenericCommandUsage(errOut);
  }

  /**
   * Prints the usage line of a single command.
   *
   * <p>The banner comes from {@link #getUsageString()} so that subclasses
   * overriding it see their own tool name here as well, consistent with
   * {@link #printUsage(PrintStream)}.
   *
   * @param errOut the stream to print to
   * @param cmd the command name, including the leading '-'
   * @throws RuntimeException if {@code cmd} is not a known command
   */
  private void printUsage(PrintStream errOut, String cmd) {
    UsageInfo usage = USAGE.get(cmd);
    if (usage == null) {
      throw new RuntimeException("No usage for cmd " + cmd);
    }
    errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]");
  }

  /**
   * Handles -transitionToActive: asks the single named service to become
   * active.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 on bad arguments or if manual state management
   *         is refused
   */
  private int transitionToActive(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("transitionToActive: incorrect number of arguments");
      printUsage(errOut, "-transitionToActive");
      return -1;
    }

    HAServiceTarget target = resolveTarget(argv[0]);
    if (!checkManualStateManagementOK(target)) {
      return -1;
    }
    HAServiceProtocol proto = target.getProxy(getConf(), 0);
    HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
    return 0;
  }

  /**
   * Handles -transitionToStandby: asks the single named service to become
   * standby.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 on bad arguments or if manual state management
   *         is refused
   */
  private int transitionToStandby(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("transitionToStandby: incorrect number of arguments");
      printUsage(errOut, "-transitionToStandby");
      return -1;
    }

    HAServiceTarget target = resolveTarget(argv[0]);
    if (!checkManualStateManagementOK(target)) {
      return -1;
    }
    HAServiceProtocol proto = target.getProxy(getConf(), 0);
    HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
    return 0;
  }

  /**
   * Ensure that we are allowed to manually manage the HA state of the target
   * service. If automatic failover is configured, then the automatic
   * failover controllers should be doing state management, and it is generally
   * an error to use the HAAdmin command line to do so.
   *
   * @param target the target to check
   * @return true if manual state management is allowed
   */
  private boolean checkManualStateManagementOK(HAServiceTarget target) {
    if (!target.isAutoFailoverEnabled()) {
      return true;
    }
    if (requestSource == RequestSource.REQUEST_BY_USER_FORCED) {
      // The administrator explicitly confirmed the override; warn and go on.
      LOG.warn("Proceeding with manual HA state management even though\n" +
          "automatic failover is enabled for " + target);
      return true;
    }
    errOut.println(
        "Automatic failover is enabled for " + target + "\n" +
        "Refusing to manually manage HA state, since it may cause\n" +
        "a split-brain scenario or other incorrect state.\n" +
        "If you are very sure you know what you are doing, please \n" +
        "specify the " + FORCEMANUAL + " flag.");
    return false;
  }

  /**
   * @return request info carrying the source of the current command
   */
  private StateChangeRequestInfo createReqInfo() {
    return new StateChangeRequestInfo(requestSource);
  }

  /**
   * Handles -failover: fails over from the first named service to the
   * second. With auto-failover enabled the request is delegated to the
   * target's ZKFC; otherwise a {@link FailoverController} performs it.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 on bad arguments or failed failover
   */
  private int failover(CommandLine cmd)
      throws IOException, ServiceFailedException {
    boolean forceFence = cmd.hasOption(FORCEFENCE);
    boolean forceActive = cmd.hasOption(FORCEACTIVE);

    // Three options are registered for this command: forcefence,
    // forceactive and forcemanual.
    int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
    final String[] args = cmd.getArgs();

    if (numOpts > 3 || args.length != 2) {
      errOut.println("failover: incorrect arguments");
      printUsage(errOut, "-failover");
      return -1;
    }

    HAServiceTarget fromNode = resolveTarget(args[0]);
    HAServiceTarget toNode = resolveTarget(args[1]);

    // Check that auto-failover is consistently configured for both nodes.
    Preconditions.checkState(
        fromNode.isAutoFailoverEnabled() ==
          toNode.isAutoFailoverEnabled(),
          "Inconsistent auto-failover configs between %s and %s!",
          fromNode, toNode);

    if (fromNode.isAutoFailoverEnabled()) {
      if (forceFence || forceActive) {
        // -forceActive doesn't make sense with auto-HA, since, if the node
        // is not healthy, then its ZKFC will immediately quit the election
        // again the next time a health check runs.
        //
        // -forceFence doesn't seem to have any real use cases with auto-HA
        // so it isn't implemented.
        errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
            "supported with auto-failover enabled.");
        return -1;
      }
      return gracefulFailoverThroughZKFCs(toNode);
    }

    FailoverController fc = new FailoverController(getConf(),
        requestSource);

    try {
      fc.failover(fromNode, toNode, forceFence, forceActive);
      out.println("Failover from "+args[0]+" to "+args[1]+" successful");
    } catch (FailoverFailedException ffe) {
      errOut.println("Failover failed: " + ffe.getLocalizedMessage());
      return -1;
    }
    return 0;
  }

  /**
   * Initiate a graceful failover by talking to the target node's ZKFC.
   * This sends an RPC to the ZKFC, which coordinates the failover.
   *
   * @param toNode the node to fail to
   * @return status code (0 for success)
   * @throws IOException if failover does not succeed
   */
  private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
      throws IOException {
    int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
    ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
    try {
      proxy.gracefulFailover();
      out.println("Failover to " + toNode + " successful");
    } catch (ServiceFailedException sfe) {
      errOut.println("Failover failed: " + sfe.getLocalizedMessage());
      return -1;
    }
    return 0;
  }

  /**
   * Handles -checkHealth: requests a health check from the single named
   * service.
   *
   * @param cmd the parsed command line
   * @return 0 if the check passes, -1 on bad arguments or a failed check
   */
  private int checkHealth(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("checkHealth: incorrect number of arguments");
      printUsage(errOut, "-checkHealth");
      return -1;
    }

    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
        getConf(), rpcTimeoutForChecks);
    try {
      HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
    } catch (HealthCheckFailedException e) {
      errOut.println("Health check failed: " + e.getLocalizedMessage());
      return -1;
    }
    return 0;
  }

  /**
   * Handles -getServiceState: prints the state reported by the single named
   * service.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 on bad arguments
   */
  private int getServiceState(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("getServiceState: incorrect number of arguments");
      printUsage(errOut, "-getServiceState");
      return -1;
    }

    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
        getConf(), rpcTimeoutForChecks);
    out.println(proto.getServiceStatus().getState());
    return 0;
  }

  /**
   * Return the serviceId as is; we are assuming it was
   * given as a service address of the form {@code host:ipcport}.
   *
   * @param serviceId the service identifier
   * @return the service address, here identical to {@code serviceId}
   */
  protected String getServiceAddr(String serviceId) {
    return serviceId;
  }

  /**
   * Stores the configuration and, when it is non-null, reads the RPC timeout
   * used by the check commands from it.
   *
   * @param conf the configuration, possibly null
   */
  @Override
  public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf != null) {
      rpcTimeoutForChecks = conf.getInt(
          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
    }
  }

  /**
   * Runs the command in {@code argv}, turning argument and I/O failures into
   * an error message and a -1 exit code.
   *
   * @param argv the command name followed by its arguments
   * @return 0 on success, -1 on failure
   * @throws Exception for failures other than
   *         {@link IllegalArgumentException} and {@link IOException}
   */
  @Override
  public int run(String[] argv) throws Exception {
    try {
      return runCmd(argv);
    } catch (IllegalArgumentException iae) {
      errOut.println("Illegal argument: " + iae.getLocalizedMessage());
      return -1;
    } catch (IOException ioe) {
      errOut.println("Operation failed: " + ioe.getLocalizedMessage());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Operation failed", ioe);
      }
      return -1;
    }
  }

  /**
   * Validates the command name, parses its options and dispatches to the
   * matching handler.
   *
   * @param argv the command name followed by its arguments
   * @return the handler's status code, or -1 if the command line is invalid
   *         or the forced-manual confirmation is declined
   * @throws Exception if the handler fails
   */
  protected int runCmd(String[] argv) throws Exception {
    if (argv.length < 1) {
      printUsage(errOut);
      return -1;
    }

    String cmd = argv[0];

    if (!cmd.startsWith("-")) {
      errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
      printUsage(errOut);
      return -1;
    }

    if (!USAGE.containsKey(cmd)) {
      errOut.println(cmd.substring(1) + ": Unknown command");
      printUsage(errOut);
      return -1;
    }

    Options opts = new Options();

    // Add command-specific options
    if ("-failover".equals(cmd)) {
      addFailoverCliOpts(opts);
    }
    // Mutative commands take FORCEMANUAL option
    if ("-transitionToActive".equals(cmd) ||
        "-transitionToStandby".equals(cmd) ||
        "-failover".equals(cmd)) {
      opts.addOption(FORCEMANUAL, false,
          "force manual control even if auto-failover is enabled");
    }

    CommandLine cmdLine = parseOpts(cmd, opts, argv);
    if (cmdLine == null) {
      // error already printed
      return -1;
    }

    // Start every command from the unforced state so that a confirmed
    // FORCEMANUAL from an earlier invocation on this same instance cannot
    // silently carry over and bypass the confirmation prompt.
    requestSource = RequestSource.REQUEST_BY_USER;
    if (cmdLine.hasOption(FORCEMANUAL)) {
      if (!confirmForceManual()) {
        LOG.fatal("Aborted");
        return -1;
      }
      // Instruct the NNs to honor this request even if they're
      // configured for automatic failover.
      requestSource = RequestSource.REQUEST_BY_USER_FORCED;
    }

    if ("-transitionToActive".equals(cmd)) {
      return transitionToActive(cmdLine);
    } else if ("-transitionToStandby".equals(cmd)) {
      return transitionToStandby(cmdLine);
    } else if ("-failover".equals(cmd)) {
      return failover(cmdLine);
    } else if ("-getServiceState".equals(cmd)) {
      return getServiceState(cmdLine);
    } else if ("-checkHealth".equals(cmd)) {
      return checkHealth(cmdLine);
    } else if ("-help".equals(cmd)) {
      return help(argv);
    } else {
      // we already checked command validity above, so getting here
      // would be a coding error
      throw new AssertionError("Should not get here, command: " + cmd);
    }
  }

  /**
   * Asks the user to confirm use of the FORCEMANUAL flag.
   *
   * @return the result of the confirmation prompt
   * @throws IOException if the prompt fails
   */
  private boolean confirmForceManual() throws IOException {
     return ToolRunner.confirmPrompt(
        "You have specified the " + FORCEMANUAL + " flag. This flag is " +
        "dangerous, as it can induce a split-brain scenario that WILL " +
        "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
        "\n" +
        "It is recommended not to use this flag, but instead to shut down the " +
        "cluster and disable automatic failover if you prefer to manually " +
        "manage your HA state.\n" +
        "\n" +
        "You may abort safely by answering 'n' or hitting ^C now.\n" +
        "\n" +
        "Are you sure you want to continue?");
  }

  /**
   * Add CLI options which are specific to the failover command and no
   * others.
   *
   * @param failoverOpts the option set to extend
   */
  private void addFailoverCliOpts(Options failoverOpts) {
    failoverOpts.addOption(FORCEFENCE, false, "force fencing");
    failoverOpts.addOption(FORCEACTIVE, false, "force failover");
    // Don't add FORCEMANUAL, since that's added separately for all commands
    // that change state.
  }

  /**
   * Parses everything after the command name against the given options.
   *
   * @param cmdName the command name, including the leading '-'
   * @param opts the options accepted by the command
   * @param argv the full argument vector, command name first
   * @return the parsed command line, or null if parsing failed (an error
   *         and the command's usage have then already been printed)
   */
  private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
    // Strip off the first arg, since that's just the command name
    String[] cmdArgs = Arrays.copyOfRange(argv, 1, argv.length);
    try {
      return new GnuParser().parse(opts, cmdArgs);
    } catch (ParseException pe) {
      errOut.println(cmdName.substring(1) +
          ": incorrect arguments");
      printUsage(errOut, cmdName);
      return null;
    }
  }

  /**
   * Handles -help: prints the general usage when no command is named, or the
   * help text of the named command. The command may be given with or
   * without its leading '-'.
   *
   * @param argv the full argument vector, "-help" first
   * @return 0 on success, -1 on bad arguments or an unknown command
   */
  private int help(String[] argv) {
    if (argv.length == 1) { // only -help
      printUsage(out);
      return 0;
    } else if (argv.length != 2) {
      printUsage(errOut, "-help");
      return -1;
    }

    String cmd = argv[1];
    if (!cmd.startsWith("-")) {
      cmd = "-" + cmd;
    }
    UsageInfo usageInfo = USAGE.get(cmd);
    if (usageInfo == null) {
      errOut.println(cmd + ": Unknown command");
      printUsage(errOut);
      return -1;
    }

    out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
    return 0;
  }

  /** Argument synopsis and help text of one command. */
  protected static class UsageInfo {
    /** Argument synopsis shown in usage lines. */
    public final String args;
    /** Description shown by -help. */
    public final String help;

    /**
     * @param args the argument synopsis
     * @param help the description of the command
     */
    public UsageInfo(String args, String help) {
      this.args = args;
      this.help = help;
    }
  }
}