001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.ha;
019    
020    import java.io.IOException;
021    import java.io.PrintStream;
022    import java.util.ArrayList;
023    import java.util.Arrays;
024    import java.util.Collection;
025    import java.util.Map;
026    
027    import org.apache.commons.cli.Options;
028    import org.apache.commons.cli.CommandLine;
029    import org.apache.commons.cli.GnuParser;
030    import org.apache.commons.cli.ParseException;
031    import org.apache.commons.logging.Log;
032    import org.apache.commons.logging.LogFactory;
033    
034    import org.apache.hadoop.classification.InterfaceAudience;
035    import org.apache.hadoop.conf.Configuration;
036    import org.apache.hadoop.conf.Configured;
037    import org.apache.hadoop.fs.CommonConfigurationKeys;
038    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
039    import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
040    import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
041    import org.apache.hadoop.util.Tool;
042    import org.apache.hadoop.util.ToolRunner;
043    
044    import com.google.common.base.Preconditions;
045    import com.google.common.collect.ImmutableMap;
046    
047    /**
048     * A command-line tool for making calls in the HAServiceProtocol.
049     * For example,. this can be used to force a service to standby or active
050     * mode, or to trigger a health-check.
051     */
052    @InterfaceAudience.Private
053    
054    public abstract class HAAdmin extends Configured implements Tool {
055      
056      private static final String FORCEFENCE  = "forcefence";
057      private static final String FORCEACTIVE = "forceactive";
058      
059      /**
060       * Undocumented flag which allows an administrator to use manual failover
061       * state transitions even when auto-failover is enabled. This is an unsafe
062       * operation, which is why it is not documented in the usage below.
063       */
064      private static final String FORCEMANUAL = "forcemanual";
065      private static final Log LOG = LogFactory.getLog(HAAdmin.class);
066    
067      private int rpcTimeoutForChecks = -1;
068      
069      protected final static Map<String, UsageInfo> USAGE =
070        ImmutableMap.<String, UsageInfo>builder()
071        .put("-transitionToActive",
072            new UsageInfo("<serviceId> [--"+FORCEACTIVE+"]", "Transitions the service into Active state"))
073        .put("-transitionToStandby",
074            new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
075        .put("-failover",
076            new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
077                "Failover from the first service to the second.\n" +
078                "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
079                "Try to failover to the target service even if it is not ready if the " + 
080                FORCEACTIVE + " option is used."))
081        .put("-getServiceState",
082            new UsageInfo("<serviceId>", "Returns the state of the service"))
083        .put("-checkHealth",
084            new UsageInfo("<serviceId>",
085                "Requests that the service perform a health check.\n" + 
086                "The HAAdmin tool will exit with a non-zero exit code\n" +
087                "if the check fails."))
088        .put("-help",
089            new UsageInfo("<command>", "Displays help on the specified command"))
090        .build();
091    
092      /** Output stream for errors, for use in tests */
093      protected PrintStream errOut = System.err;
094      protected PrintStream out = System.out;
095      private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
096    
097      protected HAAdmin() {
098        super();
099      }
100    
101      protected HAAdmin(Configuration conf) {
102        super(conf);
103      }
104    
105      protected abstract HAServiceTarget resolveTarget(String string);
106      
107      protected Collection<String> getTargetIds(String targetNodeToActivate) {
108        return new ArrayList<String>(
109            Arrays.asList(new String[]{targetNodeToActivate}));
110      }
111    
112      protected String getUsageString() {
113        return "Usage: HAAdmin";
114      }
115    
116      protected void printUsage(PrintStream errOut) {
117        errOut.println(getUsageString());
118        for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
119          String cmd = e.getKey();
120          UsageInfo usage = e.getValue();
121          
122          errOut.println("    [" + cmd + " " + usage.args + "]"); 
123        }
124        errOut.println();
125        ToolRunner.printGenericCommandUsage(errOut);    
126      }
127      
128      private static void printUsage(PrintStream errOut, String cmd) {
129        UsageInfo usage = USAGE.get(cmd);
130        if (usage == null) {
131          throw new RuntimeException("No usage for cmd " + cmd);
132        }
133        errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
134      }
135    
136      private int transitionToActive(final CommandLine cmd)
137          throws IOException, ServiceFailedException {
138        String[] argv = cmd.getArgs();
139        if (argv.length != 1) {
140          errOut.println("transitionToActive: incorrect number of arguments");
141          printUsage(errOut, "-transitionToActive");
142          return -1;
143        }
144        /*  returns true if other target node is active or some exception occurred 
145            and forceActive was not set  */
146        if(!cmd.hasOption(FORCEACTIVE)) {
147          if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) {
148            return -1;
149          }
150        }
151        HAServiceTarget target = resolveTarget(argv[0]);
152        if (!checkManualStateManagementOK(target)) {
153          return -1;
154        }
155        HAServiceProtocol proto = target.getProxy(
156            getConf(), 0);
157        HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
158        return 0;
159      }
160      
161      /**
162       * Checks whether other target node is active or not
163       * @param targetNodeToActivate
164       * @return true if other target node is active or some other exception 
165       * occurred and forceActive was set otherwise false
166       * @throws IOException
167       */
168      private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
169          throws IOException  {
170        Collection<String> targetIds = getTargetIds(targetNodeToActivate);
171        if(targetIds == null) {
172          errOut.println("transitionToActive: No target node in the "
173              + "current configuration");
174          printUsage(errOut, "-transitionToActive");
175          return true;
176        }
177        targetIds.remove(targetNodeToActivate);
178        for(String targetId : targetIds) {
179          HAServiceTarget target = resolveTarget(targetId);
180          if (!checkManualStateManagementOK(target)) {
181            return true;
182          }
183          try {
184            HAServiceProtocol proto = target.getProxy(getConf(), 5000);
185            if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
186              errOut.println("transitionToActive: Node " +  targetId +" is already active");
187              printUsage(errOut, "-transitionToActive");
188              return true;
189            }
190          } catch (Exception e) {
191            //If forceActive switch is false then return true
192            if(!forceActive) {
193              errOut.println("Unexpected error occurred  " + e.getMessage());
194              printUsage(errOut, "-transitionToActive");
195              return true; 
196            }
197          }
198        }
199        return false;
200      }
201      
202      private int transitionToStandby(final CommandLine cmd)
203          throws IOException, ServiceFailedException {
204        String[] argv = cmd.getArgs();
205        if (argv.length != 1) {
206          errOut.println("transitionToStandby: incorrect number of arguments");
207          printUsage(errOut, "-transitionToStandby");
208          return -1;
209        }
210        
211        HAServiceTarget target = resolveTarget(argv[0]);
212        if (!checkManualStateManagementOK(target)) {
213          return -1;
214        }
215        HAServiceProtocol proto = target.getProxy(
216            getConf(), 0);
217        HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
218        return 0;
219      }
220      /**
221       * Ensure that we are allowed to manually manage the HA state of the target
222       * service. If automatic failover is configured, then the automatic
223       * failover controllers should be doing state management, and it is generally
224       * an error to use the HAAdmin command line to do so.
225       * 
226       * @param target the target to check
227       * @return true if manual state management is allowed
228       */
229      private boolean checkManualStateManagementOK(HAServiceTarget target) {
230        if (target.isAutoFailoverEnabled()) {
231          if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
232            errOut.println(
233                "Automatic failover is enabled for " + target + "\n" +
234                "Refusing to manually manage HA state, since it may cause\n" +
235                "a split-brain scenario or other incorrect state.\n" +
236                "If you are very sure you know what you are doing, please \n" +
237                "specify the " + FORCEMANUAL + " flag.");
238            return false;
239          } else {
240            LOG.warn("Proceeding with manual HA state management even though\n" +
241                "automatic failover is enabled for " + target);
242            return true;
243          }
244        }
245        return true;
246      }
247    
248      private StateChangeRequestInfo createReqInfo() {
249        return new StateChangeRequestInfo(requestSource);
250      }
251    
252      private int failover(CommandLine cmd)
253          throws IOException, ServiceFailedException {
254        boolean forceFence = cmd.hasOption(FORCEFENCE);
255        boolean forceActive = cmd.hasOption(FORCEACTIVE);
256    
257        int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
258        final String[] args = cmd.getArgs();
259    
260        if (numOpts > 3 || args.length != 2) {
261          errOut.println("failover: incorrect arguments");
262          printUsage(errOut, "-failover");
263          return -1;
264        }
265    
266        HAServiceTarget fromNode = resolveTarget(args[0]);
267        HAServiceTarget toNode = resolveTarget(args[1]);
268        
269        // Check that auto-failover is consistently configured for both nodes.
270        Preconditions.checkState(
271            fromNode.isAutoFailoverEnabled() ==
272              toNode.isAutoFailoverEnabled(),
273              "Inconsistent auto-failover configs between %s and %s!",
274              fromNode, toNode);
275        
276        if (fromNode.isAutoFailoverEnabled()) {
277          if (forceFence || forceActive) {
278            // -forceActive doesn't make sense with auto-HA, since, if the node
279            // is not healthy, then its ZKFC will immediately quit the election
280            // again the next time a health check runs.
281            //
282            // -forceFence doesn't seem to have any real use cases with auto-HA
283            // so it isn't implemented.
284            errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
285                "supported with auto-failover enabled.");
286            return -1;
287          }
288          return gracefulFailoverThroughZKFCs(toNode);
289        }
290        
291        FailoverController fc = new FailoverController(getConf(),
292            requestSource);
293        
294        try {
295          fc.failover(fromNode, toNode, forceFence, forceActive); 
296          out.println("Failover from "+args[0]+" to "+args[1]+" successful");
297        } catch (FailoverFailedException ffe) {
298          errOut.println("Failover failed: " + ffe.getLocalizedMessage());
299          return -1;
300        }
301        return 0;
302      }
303      
304    
305      /**
306       * Initiate a graceful failover by talking to the target node's ZKFC.
307       * This sends an RPC to the ZKFC, which coordinates the failover.
308       * 
309       * @param toNode the node to fail to
310       * @return status code (0 for success)
311       * @throws IOException if failover does not succeed
312       */
313      private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
314          throws IOException {
315    
316        int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
317        ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
318        try {
319          proxy.gracefulFailover();
320          out.println("Failover to " + toNode + " successful");
321        } catch (ServiceFailedException sfe) {
322          errOut.println("Failover failed: " + sfe.getLocalizedMessage());
323          return -1;
324        }
325    
326        return 0;
327      }
328    
329      private int checkHealth(final CommandLine cmd)
330          throws IOException, ServiceFailedException {
331        String[] argv = cmd.getArgs();
332        if (argv.length != 1) {
333          errOut.println("checkHealth: incorrect number of arguments");
334          printUsage(errOut, "-checkHealth");
335          return -1;
336        }
337        HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
338            getConf(), rpcTimeoutForChecks);
339        try {
340          HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
341        } catch (HealthCheckFailedException e) {
342          errOut.println("Health check failed: " + e.getLocalizedMessage());
343          return -1;
344        }
345        return 0;
346      }
347    
348      private int getServiceState(final CommandLine cmd)
349          throws IOException, ServiceFailedException {
350        String[] argv = cmd.getArgs();
351        if (argv.length != 1) {
352          errOut.println("getServiceState: incorrect number of arguments");
353          printUsage(errOut, "-getServiceState");
354          return -1;
355        }
356    
357        HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
358            getConf(), rpcTimeoutForChecks);
359        out.println(proto.getServiceStatus().getState());
360        return 0;
361      }
362    
363      /**
364       * Return the serviceId as is, we are assuming it was
365       * given as a service address of form <host:ipcport>.
366       */
367      protected String getServiceAddr(String serviceId) {
368        return serviceId;
369      }
370    
371      @Override
372      public void setConf(Configuration conf) {
373        super.setConf(conf);
374        if (conf != null) {
375          rpcTimeoutForChecks = conf.getInt(
376              CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
377              CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
378        }
379      }
380    
381      @Override
382      public int run(String[] argv) throws Exception {
383        try {
384          return runCmd(argv);
385        } catch (IllegalArgumentException iae) {
386          errOut.println("Illegal argument: " + iae.getLocalizedMessage());
387          return -1;
388        } catch (IOException ioe) {
389          errOut.println("Operation failed: " + ioe.getLocalizedMessage());
390          if (LOG.isDebugEnabled()) {
391            LOG.debug("Operation failed", ioe);
392          }
393          return -1;
394        }
395      }
396      
397      protected int runCmd(String[] argv) throws Exception {
398        if (argv.length < 1) {
399          printUsage(errOut);
400          return -1;
401        }
402    
403        String cmd = argv[0];
404    
405        if (!cmd.startsWith("-")) {
406          errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
407          printUsage(errOut);
408          return -1;
409        }
410        
411        if (!USAGE.containsKey(cmd)) {
412          errOut.println(cmd.substring(1) + ": Unknown command");
413          printUsage(errOut);
414          return -1;
415        }
416        
417        Options opts = new Options();
418    
419        // Add command-specific options
420        if ("-failover".equals(cmd)) {
421          addFailoverCliOpts(opts);
422        }
423        if("-transitionToActive".equals(cmd)) {
424          addTransitionToActiveCliOpts(opts);
425        }
426        // Mutative commands take FORCEMANUAL option
427        if ("-transitionToActive".equals(cmd) ||
428            "-transitionToStandby".equals(cmd) ||
429            "-failover".equals(cmd)) {
430          opts.addOption(FORCEMANUAL, false,
431              "force manual control even if auto-failover is enabled");
432        }
433             
434        CommandLine cmdLine = parseOpts(cmd, opts, argv);
435        if (cmdLine == null) {
436          // error already printed
437          return -1;
438        }
439        
440        if (cmdLine.hasOption(FORCEMANUAL)) {
441          if (!confirmForceManual()) {
442            LOG.fatal("Aborted");
443            return -1;
444          }
445          // Instruct the NNs to honor this request even if they're
446          // configured for manual failover.
447          requestSource = RequestSource.REQUEST_BY_USER_FORCED;
448        }
449    
450        if ("-transitionToActive".equals(cmd)) {
451          return transitionToActive(cmdLine);
452        } else if ("-transitionToStandby".equals(cmd)) {
453          return transitionToStandby(cmdLine);
454        } else if ("-failover".equals(cmd)) {
455          return failover(cmdLine);
456        } else if ("-getServiceState".equals(cmd)) {
457          return getServiceState(cmdLine);
458        } else if ("-checkHealth".equals(cmd)) {
459          return checkHealth(cmdLine);
460        } else if ("-help".equals(cmd)) {
461          return help(argv);
462        } else {
463          // we already checked command validity above, so getting here
464          // would be a coding error
465          throw new AssertionError("Should not get here, command: " + cmd);
466        } 
467      }
468      
469      private boolean confirmForceManual() throws IOException {
470         return ToolRunner.confirmPrompt(
471            "You have specified the " + FORCEMANUAL + " flag. This flag is " +
472            "dangerous, as it can induce a split-brain scenario that WILL " +
473            "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
474            "\n" +
475            "It is recommended not to use this flag, but instead to shut down the " +
476            "cluster and disable automatic failover if you prefer to manually " +
477            "manage your HA state.\n" +
478            "\n" +
479            "You may abort safely by answering 'n' or hitting ^C now.\n" +
480            "\n" +
481            "Are you sure you want to continue?");
482      }
483    
484      /**
485       * Add CLI options which are specific to the failover command and no
486       * others.
487       */
488      private void addFailoverCliOpts(Options failoverOpts) {
489        failoverOpts.addOption(FORCEFENCE, false, "force fencing");
490        failoverOpts.addOption(FORCEACTIVE, false, "force failover");
491        // Don't add FORCEMANUAL, since that's added separately for all commands
492        // that change state.
493      }
494      
495      /**
496       * Add CLI options which are specific to the transitionToActive command and
497       * no others.
498       */
499      private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
500        transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
501      }
502      
503      private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
504        try {
505          // Strip off the first arg, since that's just the command name
506          argv = Arrays.copyOfRange(argv, 1, argv.length); 
507          return new GnuParser().parse(opts, argv);
508        } catch (ParseException pe) {
509          errOut.println(cmdName.substring(1) +
510              ": incorrect arguments");
511          printUsage(errOut, cmdName);
512          return null;
513        }
514      }
515      
516      private int help(String[] argv) {
517        if (argv.length == 1) { // only -help
518          printUsage(out);
519          return 0;
520        } else if (argv.length != 2) {
521          printUsage(errOut, "-help");
522          return -1;
523        }
524        String cmd = argv[1];
525        if (!cmd.startsWith("-")) {
526          cmd = "-" + cmd;
527        }
528        UsageInfo usageInfo = USAGE.get(cmd);
529        if (usageInfo == null) {
530          errOut.println(cmd + ": Unknown command");
531          printUsage(errOut);
532          return -1;
533        }
534        
535        out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
536        return 0;
537      }
538      
539      protected static class UsageInfo {
540        public final String args;
541        public final String help;
542        
543        public UsageInfo(String args, String help) {
544          this.args = args;
545          this.help = help;
546        }
547      }
548    }