001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.ha;
019
020import java.io.IOException;
021import java.io.PrintStream;
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Collection;
025import java.util.Map;
026
027import org.apache.commons.cli.Options;
028import org.apache.commons.cli.CommandLine;
029import org.apache.commons.cli.GnuParser;
030import org.apache.commons.cli.ParseException;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033
034import org.apache.hadoop.classification.InterfaceAudience;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.conf.Configured;
037import org.apache.hadoop.fs.CommonConfigurationKeys;
038import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
039import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
040import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
041import org.apache.hadoop.util.Tool;
042import org.apache.hadoop.util.ToolRunner;
043
044import com.google.common.base.Preconditions;
045import com.google.common.collect.ImmutableMap;
046
047/**
048 * A command-line tool for making calls in the HAServiceProtocol.
049 * For example,. this can be used to force a service to standby or active
050 * mode, or to trigger a health-check.
051 */
052@InterfaceAudience.Private
053
054public abstract class HAAdmin extends Configured implements Tool {
055  
056  private static final String FORCEFENCE  = "forcefence";
057  private static final String FORCEACTIVE = "forceactive";
058  
059  /**
060   * Undocumented flag which allows an administrator to use manual failover
061   * state transitions even when auto-failover is enabled. This is an unsafe
062   * operation, which is why it is not documented in the usage below.
063   */
064  private static final String FORCEMANUAL = "forcemanual";
065  private static final Log LOG = LogFactory.getLog(HAAdmin.class);
066
067  private int rpcTimeoutForChecks = -1;
068  
069  protected final static Map<String, UsageInfo> USAGE =
070    ImmutableMap.<String, UsageInfo>builder()
071    .put("-transitionToActive",
072        new UsageInfo("[--"+FORCEACTIVE+"] <serviceId>", "Transitions the service into Active state"))
073    .put("-transitionToStandby",
074        new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
075    .put("-failover",
076        new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
077            "Failover from the first service to the second.\n" +
078            "Unconditionally fence services if the --"+FORCEFENCE+" option is used.\n" +
079            "Try to failover to the target service even if it is not ready if the " + 
080            "--" + FORCEACTIVE + " option is used."))
081    .put("-getServiceState",
082        new UsageInfo("<serviceId>", "Returns the state of the service"))
083    .put("-checkHealth",
084        new UsageInfo("<serviceId>",
085            "Requests that the service perform a health check.\n" + 
086            "The HAAdmin tool will exit with a non-zero exit code\n" +
087            "if the check fails."))
088    .put("-help",
089        new UsageInfo("<command>", "Displays help on the specified command"))
090    .build();
091
092  /** Output stream for errors, for use in tests */
093  protected PrintStream errOut = System.err;
094  protected PrintStream out = System.out;
095  private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
096
097  protected HAAdmin() {
098    super();
099  }
100
101  protected HAAdmin(Configuration conf) {
102    super(conf);
103  }
104
105  protected abstract HAServiceTarget resolveTarget(String string);
106  
107  protected Collection<String> getTargetIds(String targetNodeToActivate) {
108    return new ArrayList<String>(
109        Arrays.asList(new String[]{targetNodeToActivate}));
110  }
111
112  protected String getUsageString() {
113    return "Usage: HAAdmin";
114  }
115
116  protected void printUsage(PrintStream errOut) {
117    errOut.println(getUsageString());
118    for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
119      String cmd = e.getKey();
120      UsageInfo usage = e.getValue();
121      
122      errOut.println("    [" + cmd + " " + usage.args + "]"); 
123    }
124    errOut.println();
125    ToolRunner.printGenericCommandUsage(errOut);    
126  }
127  
128  private void printUsage(PrintStream errOut, String cmd) {
129    UsageInfo usage = USAGE.get(cmd);
130    if (usage == null) {
131      throw new RuntimeException("No usage for cmd " + cmd);
132    }
133    errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]");
134  }
135
136  private int transitionToActive(final CommandLine cmd)
137      throws IOException, ServiceFailedException {
138    String[] argv = cmd.getArgs();
139    if (argv.length != 1) {
140      errOut.println("transitionToActive: incorrect number of arguments");
141      printUsage(errOut, "-transitionToActive");
142      return -1;
143    }
144    /*  returns true if other target node is active or some exception occurred 
145        and forceActive was not set  */
146    if(!cmd.hasOption(FORCEACTIVE)) {
147      if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) {
148        return -1;
149      }
150    }
151    HAServiceTarget target = resolveTarget(argv[0]);
152    if (!checkManualStateManagementOK(target)) {
153      return -1;
154    }
155    HAServiceProtocol proto = target.getProxy(
156        getConf(), 0);
157    HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
158    return 0;
159  }
160  
161  /**
162   * Checks whether other target node is active or not
163   * @param targetNodeToActivate
164   * @return true if other target node is active or some other exception 
165   * occurred and forceActive was set otherwise false
166   * @throws IOException
167   */
168  private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
169      throws IOException  {
170    Collection<String> targetIds = getTargetIds(targetNodeToActivate);
171    targetIds.remove(targetNodeToActivate);
172    for(String targetId : targetIds) {
173      HAServiceTarget target = resolveTarget(targetId);
174      if (!checkManualStateManagementOK(target)) {
175        return true;
176      }
177      try {
178        HAServiceProtocol proto = target.getProxy(getConf(), 5000);
179        if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
180          errOut.println("transitionToActive: Node " +  targetId +" is already active");
181          printUsage(errOut, "-transitionToActive");
182          return true;
183        }
184      } catch (Exception e) {
185        //If forceActive switch is false then return true
186        if(!forceActive) {
187          errOut.println("Unexpected error occurred  " + e.getMessage());
188          printUsage(errOut, "-transitionToActive");
189          return true; 
190        }
191      }
192    }
193    return false;
194  }
195  
196  private int transitionToStandby(final CommandLine cmd)
197      throws IOException, ServiceFailedException {
198    String[] argv = cmd.getArgs();
199    if (argv.length != 1) {
200      errOut.println("transitionToStandby: incorrect number of arguments");
201      printUsage(errOut, "-transitionToStandby");
202      return -1;
203    }
204    
205    HAServiceTarget target = resolveTarget(argv[0]);
206    if (!checkManualStateManagementOK(target)) {
207      return -1;
208    }
209    HAServiceProtocol proto = target.getProxy(
210        getConf(), 0);
211    HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
212    return 0;
213  }
214  /**
215   * Ensure that we are allowed to manually manage the HA state of the target
216   * service. If automatic failover is configured, then the automatic
217   * failover controllers should be doing state management, and it is generally
218   * an error to use the HAAdmin command line to do so.
219   * 
220   * @param target the target to check
221   * @return true if manual state management is allowed
222   */
223  private boolean checkManualStateManagementOK(HAServiceTarget target) {
224    if (target.isAutoFailoverEnabled()) {
225      if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
226        errOut.println(
227            "Automatic failover is enabled for " + target + "\n" +
228            "Refusing to manually manage HA state, since it may cause\n" +
229            "a split-brain scenario or other incorrect state.\n" +
230            "If you are very sure you know what you are doing, please \n" +
231            "specify the --" + FORCEMANUAL + " flag.");
232        return false;
233      } else {
234        LOG.warn("Proceeding with manual HA state management even though\n" +
235            "automatic failover is enabled for " + target);
236        return true;
237      }
238    }
239    return true;
240  }
241
242  private StateChangeRequestInfo createReqInfo() {
243    return new StateChangeRequestInfo(requestSource);
244  }
245
246  private int failover(CommandLine cmd)
247      throws IOException, ServiceFailedException {
248    boolean forceFence = cmd.hasOption(FORCEFENCE);
249    boolean forceActive = cmd.hasOption(FORCEACTIVE);
250
251    int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
252    final String[] args = cmd.getArgs();
253
254    if (numOpts > 3 || args.length != 2) {
255      errOut.println("failover: incorrect arguments");
256      printUsage(errOut, "-failover");
257      return -1;
258    }
259
260    HAServiceTarget fromNode = resolveTarget(args[0]);
261    HAServiceTarget toNode = resolveTarget(args[1]);
262    
263    // Check that auto-failover is consistently configured for both nodes.
264    Preconditions.checkState(
265        fromNode.isAutoFailoverEnabled() ==
266          toNode.isAutoFailoverEnabled(),
267          "Inconsistent auto-failover configs between %s and %s!",
268          fromNode, toNode);
269    
270    if (fromNode.isAutoFailoverEnabled()) {
271      if (forceFence || forceActive) {
272        // -forceActive doesn't make sense with auto-HA, since, if the node
273        // is not healthy, then its ZKFC will immediately quit the election
274        // again the next time a health check runs.
275        //
276        // -forceFence doesn't seem to have any real use cases with auto-HA
277        // so it isn't implemented.
278        errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
279            "supported with auto-failover enabled.");
280        return -1;
281      }
282      return gracefulFailoverThroughZKFCs(toNode);
283    }
284    
285    FailoverController fc = new FailoverController(getConf(),
286        requestSource);
287    
288    try {
289      fc.failover(fromNode, toNode, forceFence, forceActive); 
290      out.println("Failover from "+args[0]+" to "+args[1]+" successful");
291    } catch (FailoverFailedException ffe) {
292      errOut.println("Failover failed: " + ffe.getLocalizedMessage());
293      return -1;
294    }
295    return 0;
296  }
297  
298
299  /**
300   * Initiate a graceful failover by talking to the target node's ZKFC.
301   * This sends an RPC to the ZKFC, which coordinates the failover.
302   * 
303   * @param toNode the node to fail to
304   * @return status code (0 for success)
305   * @throws IOException if failover does not succeed
306   */
307  private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
308      throws IOException {
309
310    int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
311    ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
312    try {
313      proxy.gracefulFailover();
314      out.println("Failover to " + toNode + " successful");
315    } catch (ServiceFailedException sfe) {
316      errOut.println("Failover failed: " + sfe.getLocalizedMessage());
317      return -1;
318    }
319
320    return 0;
321  }
322
323  private int checkHealth(final CommandLine cmd)
324      throws IOException, ServiceFailedException {
325    String[] argv = cmd.getArgs();
326    if (argv.length != 1) {
327      errOut.println("checkHealth: incorrect number of arguments");
328      printUsage(errOut, "-checkHealth");
329      return -1;
330    }
331    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
332        getConf(), rpcTimeoutForChecks);
333    try {
334      HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
335    } catch (HealthCheckFailedException e) {
336      errOut.println("Health check failed: " + e.getLocalizedMessage());
337      return -1;
338    }
339    return 0;
340  }
341
342  private int getServiceState(final CommandLine cmd)
343      throws IOException, ServiceFailedException {
344    String[] argv = cmd.getArgs();
345    if (argv.length != 1) {
346      errOut.println("getServiceState: incorrect number of arguments");
347      printUsage(errOut, "-getServiceState");
348      return -1;
349    }
350
351    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
352        getConf(), rpcTimeoutForChecks);
353    out.println(proto.getServiceStatus().getState());
354    return 0;
355  }
356
357  /**
358   * Return the serviceId as is, we are assuming it was
359   * given as a service address of form <host:ipcport>.
360   */
361  protected String getServiceAddr(String serviceId) {
362    return serviceId;
363  }
364
365  @Override
366  public void setConf(Configuration conf) {
367    super.setConf(conf);
368    if (conf != null) {
369      rpcTimeoutForChecks = conf.getInt(
370          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
371          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
372    }
373  }
374
375  @Override
376  public int run(String[] argv) throws Exception {
377    try {
378      return runCmd(argv);
379    } catch (IllegalArgumentException iae) {
380      errOut.println("Illegal argument: " + iae.getLocalizedMessage());
381      return -1;
382    } catch (IOException ioe) {
383      errOut.println("Operation failed: " + ioe.getLocalizedMessage());
384      if (LOG.isDebugEnabled()) {
385        LOG.debug("Operation failed", ioe);
386      }
387      return -1;
388    }
389  }
390  
391  protected int runCmd(String[] argv) throws Exception {
392    if (argv.length < 1) {
393      printUsage(errOut);
394      return -1;
395    }
396
397    String cmd = argv[0];
398
399    if (!cmd.startsWith("-")) {
400      errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
401      printUsage(errOut);
402      return -1;
403    }
404    
405    if (!USAGE.containsKey(cmd)) {
406      errOut.println(cmd.substring(1) + ": Unknown command");
407      printUsage(errOut);
408      return -1;
409    }
410    
411    Options opts = new Options();
412
413    // Add command-specific options
414    if ("-failover".equals(cmd)) {
415      addFailoverCliOpts(opts);
416    }
417    if("-transitionToActive".equals(cmd)) {
418      addTransitionToActiveCliOpts(opts);
419    }
420    // Mutative commands take FORCEMANUAL option
421    if ("-transitionToActive".equals(cmd) ||
422        "-transitionToStandby".equals(cmd) ||
423        "-failover".equals(cmd)) {
424      opts.addOption(FORCEMANUAL, false,
425          "force manual control even if auto-failover is enabled");
426    }
427         
428    CommandLine cmdLine = parseOpts(cmd, opts, argv);
429    if (cmdLine == null) {
430      // error already printed
431      return -1;
432    }
433    
434    if (cmdLine.hasOption(FORCEMANUAL)) {
435      if (!confirmForceManual()) {
436        LOG.fatal("Aborted");
437        return -1;
438      }
439      // Instruct the NNs to honor this request even if they're
440      // configured for manual failover.
441      requestSource = RequestSource.REQUEST_BY_USER_FORCED;
442    }
443
444    if ("-transitionToActive".equals(cmd)) {
445      return transitionToActive(cmdLine);
446    } else if ("-transitionToStandby".equals(cmd)) {
447      return transitionToStandby(cmdLine);
448    } else if ("-failover".equals(cmd)) {
449      return failover(cmdLine);
450    } else if ("-getServiceState".equals(cmd)) {
451      return getServiceState(cmdLine);
452    } else if ("-checkHealth".equals(cmd)) {
453      return checkHealth(cmdLine);
454    } else if ("-help".equals(cmd)) {
455      return help(argv);
456    } else {
457      // we already checked command validity above, so getting here
458      // would be a coding error
459      throw new AssertionError("Should not get here, command: " + cmd);
460    } 
461  }
462  
463  private boolean confirmForceManual() throws IOException {
464     return ToolRunner.confirmPrompt(
465        "You have specified the --" + FORCEMANUAL + " flag. This flag is " +
466        "dangerous, as it can induce a split-brain scenario that WILL " +
467        "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
468        "\n" +
469        "It is recommended not to use this flag, but instead to shut down the " +
470        "cluster and disable automatic failover if you prefer to manually " +
471        "manage your HA state.\n" +
472        "\n" +
473        "You may abort safely by answering 'n' or hitting ^C now.\n" +
474        "\n" +
475        "Are you sure you want to continue?");
476  }
477
478  /**
479   * Add CLI options which are specific to the failover command and no
480   * others.
481   */
482  private void addFailoverCliOpts(Options failoverOpts) {
483    failoverOpts.addOption(FORCEFENCE, false, "force fencing");
484    failoverOpts.addOption(FORCEACTIVE, false, "force failover");
485    // Don't add FORCEMANUAL, since that's added separately for all commands
486    // that change state.
487  }
488  
489  /**
490   * Add CLI options which are specific to the transitionToActive command and
491   * no others.
492   */
493  private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
494    transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
495  }
496  
497  private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
498    try {
499      // Strip off the first arg, since that's just the command name
500      argv = Arrays.copyOfRange(argv, 1, argv.length); 
501      return new GnuParser().parse(opts, argv);
502    } catch (ParseException pe) {
503      errOut.println(cmdName.substring(1) +
504          ": incorrect arguments");
505      printUsage(errOut, cmdName);
506      return null;
507    }
508  }
509  
510  private int help(String[] argv) {
511    if (argv.length == 1) { // only -help
512      printUsage(out);
513      return 0;
514    } else if (argv.length != 2) {
515      printUsage(errOut, "-help");
516      return -1;
517    }
518    String cmd = argv[1];
519    if (!cmd.startsWith("-")) {
520      cmd = "-" + cmd;
521    }
522    UsageInfo usageInfo = USAGE.get(cmd);
523    if (usageInfo == null) {
524      errOut.println(cmd + ": Unknown command");
525      printUsage(errOut);
526      return -1;
527    }
528    
529    out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
530    return 0;
531  }
532  
533  protected static class UsageInfo {
534    public final String args;
535    public final String help;
536    
537    public UsageInfo(String args, String help) {
538      this.args = args;
539      this.help = help;
540    }
541  }
542}