001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.ha;
019
020import java.io.IOException;
021import java.io.PrintStream;
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Collection;
025import java.util.Map;
026
027import org.apache.commons.cli.Options;
028import org.apache.commons.cli.CommandLine;
029import org.apache.commons.cli.GnuParser;
030import org.apache.commons.cli.ParseException;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033
034import org.apache.hadoop.classification.InterfaceAudience;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.conf.Configured;
037import org.apache.hadoop.fs.CommonConfigurationKeys;
038import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
039import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
040import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
041import org.apache.hadoop.util.Tool;
042import org.apache.hadoop.util.ToolRunner;
043
044import com.google.common.base.Preconditions;
045import com.google.common.collect.ImmutableMap;
046
047/**
048 * A command-line tool for making calls in the HAServiceProtocol.
049 * For example,. this can be used to force a service to standby or active
050 * mode, or to trigger a health-check.
051 */
052@InterfaceAudience.Private
053
054public abstract class HAAdmin extends Configured implements Tool {
055  
056  private static final String FORCEFENCE  = "forcefence";
057  private static final String FORCEACTIVE = "forceactive";
058  
059  /**
060   * Undocumented flag which allows an administrator to use manual failover
061   * state transitions even when auto-failover is enabled. This is an unsafe
062   * operation, which is why it is not documented in the usage below.
063   */
064  private static final String FORCEMANUAL = "forcemanual";
065  private static final Log LOG = LogFactory.getLog(HAAdmin.class);
066
067  private int rpcTimeoutForChecks = -1;
068  
069  protected final static Map<String, UsageInfo> USAGE =
070    ImmutableMap.<String, UsageInfo>builder()
071    .put("-transitionToActive",
072        new UsageInfo("<serviceId> [--"+FORCEACTIVE+"]", "Transitions the service into Active state"))
073    .put("-transitionToStandby",
074        new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
075    .put("-failover",
076        new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
077            "Failover from the first service to the second.\n" +
078            "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
079            "Try to failover to the target service even if it is not ready if the " + 
080            FORCEACTIVE + " option is used."))
081    .put("-getServiceState",
082        new UsageInfo("<serviceId>", "Returns the state of the service"))
083    .put("-checkHealth",
084        new UsageInfo("<serviceId>",
085            "Requests that the service perform a health check.\n" + 
086            "The HAAdmin tool will exit with a non-zero exit code\n" +
087            "if the check fails."))
088    .put("-help",
089        new UsageInfo("<command>", "Displays help on the specified command"))
090    .build();
091
092  /** Output stream for errors, for use in tests */
093  protected PrintStream errOut = System.err;
094  protected PrintStream out = System.out;
095  private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
096
097  protected HAAdmin() {
098    super();
099  }
100
101  protected HAAdmin(Configuration conf) {
102    super(conf);
103  }
104
105  protected abstract HAServiceTarget resolveTarget(String string);
106  
107  protected Collection<String> getTargetIds(String targetNodeToActivate) {
108    return new ArrayList<String>(
109        Arrays.asList(new String[]{targetNodeToActivate}));
110  }
111
112  protected String getUsageString() {
113    return "Usage: HAAdmin";
114  }
115
116  protected void printUsage(PrintStream errOut) {
117    errOut.println(getUsageString());
118    for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
119      String cmd = e.getKey();
120      UsageInfo usage = e.getValue();
121      
122      errOut.println("    [" + cmd + " " + usage.args + "]"); 
123    }
124    errOut.println();
125    ToolRunner.printGenericCommandUsage(errOut);    
126  }
127  
128  private static void printUsage(PrintStream errOut, String cmd) {
129    UsageInfo usage = USAGE.get(cmd);
130    if (usage == null) {
131      throw new RuntimeException("No usage for cmd " + cmd);
132    }
133    errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
134  }
135
136  private int transitionToActive(final CommandLine cmd)
137      throws IOException, ServiceFailedException {
138    String[] argv = cmd.getArgs();
139    if (argv.length != 1) {
140      errOut.println("transitionToActive: incorrect number of arguments");
141      printUsage(errOut, "-transitionToActive");
142      return -1;
143    }
144    /*  returns true if other target node is active or some exception occurred 
145        and forceActive was not set  */
146    if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) {
147      return -1;
148    }
149    HAServiceTarget target = resolveTarget(argv[0]);
150    if (!checkManualStateManagementOK(target)) {
151      return -1;
152    }
153    HAServiceProtocol proto = target.getProxy(
154        getConf(), 0);
155    HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
156    return 0;
157  }
158  
159  /**
160   * Checks whether other target node is active or not
161   * @param targetNodeToActivate
162   * @return true if other target node is active or some other exception 
163   * occurred and forceActive was set otherwise false
164   * @throws IOException
165   */
166  private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
167      throws IOException  {
168    Collection<String> targetIds = getTargetIds(targetNodeToActivate);
169    if(targetIds == null) {
170      errOut.println("transitionToActive: No target node in the "
171          + "current configuration");
172      printUsage(errOut, "-transitionToActive");
173      return true;
174    }
175    targetIds.remove(targetNodeToActivate);
176    for(String targetId : targetIds) {
177      HAServiceTarget target = resolveTarget(targetId);
178      if (!checkManualStateManagementOK(target)) {
179        return true;
180      }
181      try {
182        HAServiceProtocol proto = target.getProxy(getConf(), 5000);
183        if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
184          errOut.println("transitionToActive: Node " +  targetId +" is already active");
185          printUsage(errOut, "-transitionToActive");
186          return true;
187        }
188      } catch (Exception e) {
189        //If forceActive switch is false then return true
190        if(!forceActive) {
191          errOut.println("Unexpected error occurred  " + e.getMessage());
192          printUsage(errOut, "-transitionToActive");
193          return true; 
194        }
195      }
196    }
197    return false;
198  }
199  
200  private int transitionToStandby(final CommandLine cmd)
201      throws IOException, ServiceFailedException {
202    String[] argv = cmd.getArgs();
203    if (argv.length != 1) {
204      errOut.println("transitionToStandby: incorrect number of arguments");
205      printUsage(errOut, "-transitionToStandby");
206      return -1;
207    }
208    
209    HAServiceTarget target = resolveTarget(argv[0]);
210    if (!checkManualStateManagementOK(target)) {
211      return -1;
212    }
213    HAServiceProtocol proto = target.getProxy(
214        getConf(), 0);
215    HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
216    return 0;
217  }
218  /**
219   * Ensure that we are allowed to manually manage the HA state of the target
220   * service. If automatic failover is configured, then the automatic
221   * failover controllers should be doing state management, and it is generally
222   * an error to use the HAAdmin command line to do so.
223   * 
224   * @param target the target to check
225   * @return true if manual state management is allowed
226   */
227  private boolean checkManualStateManagementOK(HAServiceTarget target) {
228    if (target.isAutoFailoverEnabled()) {
229      if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
230        errOut.println(
231            "Automatic failover is enabled for " + target + "\n" +
232            "Refusing to manually manage HA state, since it may cause\n" +
233            "a split-brain scenario or other incorrect state.\n" +
234            "If you are very sure you know what you are doing, please \n" +
235            "specify the " + FORCEMANUAL + " flag.");
236        return false;
237      } else {
238        LOG.warn("Proceeding with manual HA state management even though\n" +
239            "automatic failover is enabled for " + target);
240        return true;
241      }
242    }
243    return true;
244  }
245
246  private StateChangeRequestInfo createReqInfo() {
247    return new StateChangeRequestInfo(requestSource);
248  }
249
250  private int failover(CommandLine cmd)
251      throws IOException, ServiceFailedException {
252    boolean forceFence = cmd.hasOption(FORCEFENCE);
253    boolean forceActive = cmd.hasOption(FORCEACTIVE);
254
255    int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
256    final String[] args = cmd.getArgs();
257
258    if (numOpts > 3 || args.length != 2) {
259      errOut.println("failover: incorrect arguments");
260      printUsage(errOut, "-failover");
261      return -1;
262    }
263
264    HAServiceTarget fromNode = resolveTarget(args[0]);
265    HAServiceTarget toNode = resolveTarget(args[1]);
266    
267    // Check that auto-failover is consistently configured for both nodes.
268    Preconditions.checkState(
269        fromNode.isAutoFailoverEnabled() ==
270          toNode.isAutoFailoverEnabled(),
271          "Inconsistent auto-failover configs between %s and %s!",
272          fromNode, toNode);
273    
274    if (fromNode.isAutoFailoverEnabled()) {
275      if (forceFence || forceActive) {
276        // -forceActive doesn't make sense with auto-HA, since, if the node
277        // is not healthy, then its ZKFC will immediately quit the election
278        // again the next time a health check runs.
279        //
280        // -forceFence doesn't seem to have any real use cases with auto-HA
281        // so it isn't implemented.
282        errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
283            "supported with auto-failover enabled.");
284        return -1;
285      }
286      return gracefulFailoverThroughZKFCs(toNode);
287    }
288    
289    FailoverController fc = new FailoverController(getConf(),
290        requestSource);
291    
292    try {
293      fc.failover(fromNode, toNode, forceFence, forceActive); 
294      out.println("Failover from "+args[0]+" to "+args[1]+" successful");
295    } catch (FailoverFailedException ffe) {
296      errOut.println("Failover failed: " + ffe.getLocalizedMessage());
297      return -1;
298    }
299    return 0;
300  }
301  
302
303  /**
304   * Initiate a graceful failover by talking to the target node's ZKFC.
305   * This sends an RPC to the ZKFC, which coordinates the failover.
306   * 
307   * @param toNode the node to fail to
308   * @return status code (0 for success)
309   * @throws IOException if failover does not succeed
310   */
311  private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
312      throws IOException {
313
314    int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
315    ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
316    try {
317      proxy.gracefulFailover();
318      out.println("Failover to " + toNode + " successful");
319    } catch (ServiceFailedException sfe) {
320      errOut.println("Failover failed: " + sfe.getLocalizedMessage());
321      return -1;
322    }
323
324    return 0;
325  }
326
327  private int checkHealth(final CommandLine cmd)
328      throws IOException, ServiceFailedException {
329    String[] argv = cmd.getArgs();
330    if (argv.length != 1) {
331      errOut.println("checkHealth: incorrect number of arguments");
332      printUsage(errOut, "-checkHealth");
333      return -1;
334    }
335    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
336        getConf(), rpcTimeoutForChecks);
337    try {
338      HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
339    } catch (HealthCheckFailedException e) {
340      errOut.println("Health check failed: " + e.getLocalizedMessage());
341      return -1;
342    }
343    return 0;
344  }
345
346  private int getServiceState(final CommandLine cmd)
347      throws IOException, ServiceFailedException {
348    String[] argv = cmd.getArgs();
349    if (argv.length != 1) {
350      errOut.println("getServiceState: incorrect number of arguments");
351      printUsage(errOut, "-getServiceState");
352      return -1;
353    }
354
355    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
356        getConf(), rpcTimeoutForChecks);
357    out.println(proto.getServiceStatus().getState());
358    return 0;
359  }
360
361  /**
362   * Return the serviceId as is, we are assuming it was
363   * given as a service address of form <host:ipcport>.
364   */
365  protected String getServiceAddr(String serviceId) {
366    return serviceId;
367  }
368
369  @Override
370  public void setConf(Configuration conf) {
371    super.setConf(conf);
372    if (conf != null) {
373      rpcTimeoutForChecks = conf.getInt(
374          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
375          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
376    }
377  }
378
379  @Override
380  public int run(String[] argv) throws Exception {
381    try {
382      return runCmd(argv);
383    } catch (IllegalArgumentException iae) {
384      errOut.println("Illegal argument: " + iae.getLocalizedMessage());
385      return -1;
386    } catch (IOException ioe) {
387      errOut.println("Operation failed: " + ioe.getLocalizedMessage());
388      if (LOG.isDebugEnabled()) {
389        LOG.debug("Operation failed", ioe);
390      }
391      return -1;
392    }
393  }
394  
395  protected int runCmd(String[] argv) throws Exception {
396    if (argv.length < 1) {
397      printUsage(errOut);
398      return -1;
399    }
400
401    String cmd = argv[0];
402
403    if (!cmd.startsWith("-")) {
404      errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
405      printUsage(errOut);
406      return -1;
407    }
408    
409    if (!USAGE.containsKey(cmd)) {
410      errOut.println(cmd.substring(1) + ": Unknown command");
411      printUsage(errOut);
412      return -1;
413    }
414    
415    Options opts = new Options();
416
417    // Add command-specific options
418    if ("-failover".equals(cmd)) {
419      addFailoverCliOpts(opts);
420    }
421    if("-transitionToActive".equals(cmd)) {
422      addTransitionToActiveCliOpts(opts);
423    }
424    // Mutative commands take FORCEMANUAL option
425    if ("-transitionToActive".equals(cmd) ||
426        "-transitionToStandby".equals(cmd) ||
427        "-failover".equals(cmd)) {
428      opts.addOption(FORCEMANUAL, false,
429          "force manual control even if auto-failover is enabled");
430    }
431         
432    CommandLine cmdLine = parseOpts(cmd, opts, argv);
433    if (cmdLine == null) {
434      // error already printed
435      return -1;
436    }
437    
438    if (cmdLine.hasOption(FORCEMANUAL)) {
439      if (!confirmForceManual()) {
440        LOG.fatal("Aborted");
441        return -1;
442      }
443      // Instruct the NNs to honor this request even if they're
444      // configured for manual failover.
445      requestSource = RequestSource.REQUEST_BY_USER_FORCED;
446    }
447
448    if ("-transitionToActive".equals(cmd)) {
449      return transitionToActive(cmdLine);
450    } else if ("-transitionToStandby".equals(cmd)) {
451      return transitionToStandby(cmdLine);
452    } else if ("-failover".equals(cmd)) {
453      return failover(cmdLine);
454    } else if ("-getServiceState".equals(cmd)) {
455      return getServiceState(cmdLine);
456    } else if ("-checkHealth".equals(cmd)) {
457      return checkHealth(cmdLine);
458    } else if ("-help".equals(cmd)) {
459      return help(argv);
460    } else {
461      // we already checked command validity above, so getting here
462      // would be a coding error
463      throw new AssertionError("Should not get here, command: " + cmd);
464    } 
465  }
466  
467  private boolean confirmForceManual() throws IOException {
468     return ToolRunner.confirmPrompt(
469        "You have specified the " + FORCEMANUAL + " flag. This flag is " +
470        "dangerous, as it can induce a split-brain scenario that WILL " +
471        "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
472        "\n" +
473        "It is recommended not to use this flag, but instead to shut down the " +
474        "cluster and disable automatic failover if you prefer to manually " +
475        "manage your HA state.\n" +
476        "\n" +
477        "You may abort safely by answering 'n' or hitting ^C now.\n" +
478        "\n" +
479        "Are you sure you want to continue?");
480  }
481
482  /**
483   * Add CLI options which are specific to the failover command and no
484   * others.
485   */
486  private void addFailoverCliOpts(Options failoverOpts) {
487    failoverOpts.addOption(FORCEFENCE, false, "force fencing");
488    failoverOpts.addOption(FORCEACTIVE, false, "force failover");
489    // Don't add FORCEMANUAL, since that's added separately for all commands
490    // that change state.
491  }
492  
493  /**
494   * Add CLI options which are specific to the transitionToActive command and
495   * no others.
496   */
497  private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
498    transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
499  }
500  
501  private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
502    try {
503      // Strip off the first arg, since that's just the command name
504      argv = Arrays.copyOfRange(argv, 1, argv.length); 
505      return new GnuParser().parse(opts, argv);
506    } catch (ParseException pe) {
507      errOut.println(cmdName.substring(1) +
508          ": incorrect arguments");
509      printUsage(errOut, cmdName);
510      return null;
511    }
512  }
513  
514  private int help(String[] argv) {
515    if (argv.length == 1) { // only -help
516      printUsage(out);
517      return 0;
518    } else if (argv.length != 2) {
519      printUsage(errOut, "-help");
520      return -1;
521    }
522    String cmd = argv[1];
523    if (!cmd.startsWith("-")) {
524      cmd = "-" + cmd;
525    }
526    UsageInfo usageInfo = USAGE.get(cmd);
527    if (usageInfo == null) {
528      errOut.println(cmd + ": Unknown command");
529      printUsage(errOut);
530      return -1;
531    }
532    
533    out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
534    return 0;
535  }
536  
537  protected static class UsageInfo {
538    public final String args;
539    public final String help;
540    
541    public UsageInfo(String args, String help) {
542      this.args = args;
543      this.help = help;
544    }
545  }
546}