001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.ha;
019    
020    import java.io.IOException;
021    import java.io.PrintStream;
022    import java.util.Arrays;
023    import java.util.Map;
024    
025    import org.apache.commons.cli.Options;
026    import org.apache.commons.cli.CommandLine;
027    import org.apache.commons.cli.GnuParser;
028    import org.apache.commons.cli.ParseException;
029    import org.apache.commons.logging.Log;
030    import org.apache.commons.logging.LogFactory;
031    
032    import org.apache.hadoop.classification.InterfaceAudience;
033    import org.apache.hadoop.conf.Configuration;
034    import org.apache.hadoop.conf.Configured;
035    import org.apache.hadoop.fs.CommonConfigurationKeys;
036    import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
037    import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
038    import org.apache.hadoop.util.Tool;
039    import org.apache.hadoop.util.ToolRunner;
040    
041    import com.google.common.base.Preconditions;
042    import com.google.common.collect.ImmutableMap;
043    
044    /**
045     * A command-line tool for making calls in the HAServiceProtocol.
046     * For example,. this can be used to force a service to standby or active
047     * mode, or to trigger a health-check.
048     */
049    @InterfaceAudience.Private
050    
051    public abstract class HAAdmin extends Configured implements Tool {
052      
053      private static final String FORCEFENCE  = "forcefence";
054      private static final String FORCEACTIVE = "forceactive";
055      
056      /**
057       * Undocumented flag which allows an administrator to use manual failover
058       * state transitions even when auto-failover is enabled. This is an unsafe
059       * operation, which is why it is not documented in the usage below.
060       */
061      private static final String FORCEMANUAL = "forcemanual";
062      private static final Log LOG = LogFactory.getLog(HAAdmin.class);
063    
064      private int rpcTimeoutForChecks = -1;
065      
066      protected final static Map<String, UsageInfo> USAGE =
067        ImmutableMap.<String, UsageInfo>builder()
068        .put("-transitionToActive",
069            new UsageInfo("<serviceId>", "Transitions the service into Active state"))
070        .put("-transitionToStandby",
071            new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
072        .put("-failover",
073            new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
074                "Failover from the first service to the second.\n" +
075                "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
076                "Try to failover to the target service even if it is not ready if the " + 
077                FORCEACTIVE + " option is used."))
078        .put("-getServiceState",
079            new UsageInfo("<serviceId>", "Returns the state of the service"))
080        .put("-checkHealth",
081            new UsageInfo("<serviceId>",
082                "Requests that the service perform a health check.\n" + 
083                "The HAAdmin tool will exit with a non-zero exit code\n" +
084                "if the check fails."))
085        .put("-help",
086            new UsageInfo("<command>", "Displays help on the specified command"))
087        .build();
088    
089      /** Output stream for errors, for use in tests */
090      protected PrintStream errOut = System.err;
091      protected PrintStream out = System.out;
092      private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
093    
094      protected HAAdmin() {
095        super();
096      }
097    
098      protected HAAdmin(Configuration conf) {
099        super(conf);
100      }
101    
102      protected abstract HAServiceTarget resolveTarget(String string);
103    
104      protected String getUsageString() {
105        return "Usage: HAAdmin";
106      }
107    
108      protected void printUsage(PrintStream errOut) {
109        errOut.println(getUsageString());
110        for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
111          String cmd = e.getKey();
112          UsageInfo usage = e.getValue();
113          
114          errOut.println("    [" + cmd + " " + usage.args + "]"); 
115        }
116        errOut.println();
117        ToolRunner.printGenericCommandUsage(errOut);    
118      }
119      
120      private static void printUsage(PrintStream errOut, String cmd) {
121        UsageInfo usage = USAGE.get(cmd);
122        if (usage == null) {
123          throw new RuntimeException("No usage for cmd " + cmd);
124        }
125        errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
126      }
127    
128      private int transitionToActive(final CommandLine cmd)
129          throws IOException, ServiceFailedException {
130        String[] argv = cmd.getArgs();
131        if (argv.length != 1) {
132          errOut.println("transitionToActive: incorrect number of arguments");
133          printUsage(errOut, "-transitionToActive");
134          return -1;
135        }
136        HAServiceTarget target = resolveTarget(argv[0]);
137        if (!checkManualStateManagementOK(target)) {
138          return -1;
139        }
140        HAServiceProtocol proto = target.getProxy(
141            getConf(), 0);
142        HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
143        return 0;
144      }
145    
146      private int transitionToStandby(final CommandLine cmd)
147          throws IOException, ServiceFailedException {
148        String[] argv = cmd.getArgs();
149        if (argv.length != 1) {
150          errOut.println("transitionToStandby: incorrect number of arguments");
151          printUsage(errOut, "-transitionToStandby");
152          return -1;
153        }
154        
155        HAServiceTarget target = resolveTarget(argv[0]);
156        if (!checkManualStateManagementOK(target)) {
157          return -1;
158        }
159        HAServiceProtocol proto = target.getProxy(
160            getConf(), 0);
161        HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
162        return 0;
163      }
164      /**
165       * Ensure that we are allowed to manually manage the HA state of the target
166       * service. If automatic failover is configured, then the automatic
167       * failover controllers should be doing state management, and it is generally
168       * an error to use the HAAdmin command line to do so.
169       * 
170       * @param target the target to check
171       * @return true if manual state management is allowed
172       */
173      private boolean checkManualStateManagementOK(HAServiceTarget target) {
174        if (target.isAutoFailoverEnabled()) {
175          if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
176            errOut.println(
177                "Automatic failover is enabled for " + target + "\n" +
178                "Refusing to manually manage HA state, since it may cause\n" +
179                "a split-brain scenario or other incorrect state.\n" +
180                "If you are very sure you know what you are doing, please \n" +
181                "specify the " + FORCEMANUAL + " flag.");
182            return false;
183          } else {
184            LOG.warn("Proceeding with manual HA state management even though\n" +
185                "automatic failover is enabled for " + target);
186            return true;
187          }
188        }
189        return true;
190      }
191    
192      private StateChangeRequestInfo createReqInfo() {
193        return new StateChangeRequestInfo(requestSource);
194      }
195    
196      private int failover(CommandLine cmd)
197          throws IOException, ServiceFailedException {
198        boolean forceFence = cmd.hasOption(FORCEFENCE);
199        boolean forceActive = cmd.hasOption(FORCEACTIVE);
200    
201        int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
202        final String[] args = cmd.getArgs();
203    
204        if (numOpts > 3 || args.length != 2) {
205          errOut.println("failover: incorrect arguments");
206          printUsage(errOut, "-failover");
207          return -1;
208        }
209    
210        HAServiceTarget fromNode = resolveTarget(args[0]);
211        HAServiceTarget toNode = resolveTarget(args[1]);
212        
213        // Check that auto-failover is consistently configured for both nodes.
214        Preconditions.checkState(
215            fromNode.isAutoFailoverEnabled() ==
216              toNode.isAutoFailoverEnabled(),
217              "Inconsistent auto-failover configs between %s and %s!",
218              fromNode, toNode);
219        
220        if (fromNode.isAutoFailoverEnabled()) {
221          if (forceFence || forceActive) {
222            // -forceActive doesn't make sense with auto-HA, since, if the node
223            // is not healthy, then its ZKFC will immediately quit the election
224            // again the next time a health check runs.
225            //
226            // -forceFence doesn't seem to have any real use cases with auto-HA
227            // so it isn't implemented.
228            errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
229                "supported with auto-failover enabled.");
230            return -1;
231          }
232          return gracefulFailoverThroughZKFCs(toNode);
233        }
234        
235        FailoverController fc = new FailoverController(getConf(),
236            requestSource);
237        
238        try {
239          fc.failover(fromNode, toNode, forceFence, forceActive); 
240          out.println("Failover from "+args[0]+" to "+args[1]+" successful");
241        } catch (FailoverFailedException ffe) {
242          errOut.println("Failover failed: " + ffe.getLocalizedMessage());
243          return -1;
244        }
245        return 0;
246      }
247      
248    
249      /**
250       * Initiate a graceful failover by talking to the target node's ZKFC.
251       * This sends an RPC to the ZKFC, which coordinates the failover.
252       * 
253       * @param toNode the node to fail to
254       * @return status code (0 for success)
255       * @throws IOException if failover does not succeed
256       */
257      private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
258          throws IOException {
259    
260        int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
261        ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
262        try {
263          proxy.gracefulFailover();
264          out.println("Failover to " + toNode + " successful");
265        } catch (ServiceFailedException sfe) {
266          errOut.println("Failover failed: " + sfe.getLocalizedMessage());
267          return -1;
268        }
269    
270        return 0;
271      }
272    
273      private int checkHealth(final CommandLine cmd)
274          throws IOException, ServiceFailedException {
275        String[] argv = cmd.getArgs();
276        if (argv.length != 1) {
277          errOut.println("checkHealth: incorrect number of arguments");
278          printUsage(errOut, "-checkHealth");
279          return -1;
280        }
281        HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
282            getConf(), rpcTimeoutForChecks);
283        try {
284          HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
285        } catch (HealthCheckFailedException e) {
286          errOut.println("Health check failed: " + e.getLocalizedMessage());
287          return -1;
288        }
289        return 0;
290      }
291    
292      private int getServiceState(final CommandLine cmd)
293          throws IOException, ServiceFailedException {
294        String[] argv = cmd.getArgs();
295        if (argv.length != 1) {
296          errOut.println("getServiceState: incorrect number of arguments");
297          printUsage(errOut, "-getServiceState");
298          return -1;
299        }
300    
301        HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
302            getConf(), rpcTimeoutForChecks);
303        out.println(proto.getServiceStatus().getState());
304        return 0;
305      }
306    
307      /**
308       * Return the serviceId as is, we are assuming it was
309       * given as a service address of form <host:ipcport>.
310       */
311      protected String getServiceAddr(String serviceId) {
312        return serviceId;
313      }
314    
315      @Override
316      public void setConf(Configuration conf) {
317        super.setConf(conf);
318        if (conf != null) {
319          rpcTimeoutForChecks = conf.getInt(
320              CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
321              CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
322        }
323      }
324    
325      @Override
326      public int run(String[] argv) throws Exception {
327        try {
328          return runCmd(argv);
329        } catch (IllegalArgumentException iae) {
330          errOut.println("Illegal argument: " + iae.getLocalizedMessage());
331          return -1;
332        } catch (IOException ioe) {
333          errOut.println("Operation failed: " + ioe.getLocalizedMessage());
334          if (LOG.isDebugEnabled()) {
335            LOG.debug("Operation failed", ioe);
336          }
337          return -1;
338        }
339      }
340      
341      protected int runCmd(String[] argv) throws Exception {
342        if (argv.length < 1) {
343          printUsage(errOut);
344          return -1;
345        }
346    
347        String cmd = argv[0];
348    
349        if (!cmd.startsWith("-")) {
350          errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
351          printUsage(errOut);
352          return -1;
353        }
354        
355        if (!USAGE.containsKey(cmd)) {
356          errOut.println(cmd.substring(1) + ": Unknown command");
357          printUsage(errOut);
358          return -1;
359        }
360        
361        Options opts = new Options();
362    
363        // Add command-specific options
364        if ("-failover".equals(cmd)) {
365          addFailoverCliOpts(opts);
366        }
367        // Mutative commands take FORCEMANUAL option
368        if ("-transitionToActive".equals(cmd) ||
369            "-transitionToStandby".equals(cmd) ||
370            "-failover".equals(cmd)) {
371          opts.addOption(FORCEMANUAL, false,
372              "force manual control even if auto-failover is enabled");
373        }
374             
375        CommandLine cmdLine = parseOpts(cmd, opts, argv);
376        if (cmdLine == null) {
377          // error already printed
378          return -1;
379        }
380        
381        if (cmdLine.hasOption(FORCEMANUAL)) {
382          if (!confirmForceManual()) {
383            LOG.fatal("Aborted");
384            return -1;
385          }
386          // Instruct the NNs to honor this request even if they're
387          // configured for manual failover.
388          requestSource = RequestSource.REQUEST_BY_USER_FORCED;
389        }
390    
391        if ("-transitionToActive".equals(cmd)) {
392          return transitionToActive(cmdLine);
393        } else if ("-transitionToStandby".equals(cmd)) {
394          return transitionToStandby(cmdLine);
395        } else if ("-failover".equals(cmd)) {
396          return failover(cmdLine);
397        } else if ("-getServiceState".equals(cmd)) {
398          return getServiceState(cmdLine);
399        } else if ("-checkHealth".equals(cmd)) {
400          return checkHealth(cmdLine);
401        } else if ("-help".equals(cmd)) {
402          return help(argv);
403        } else {
404          // we already checked command validity above, so getting here
405          // would be a coding error
406          throw new AssertionError("Should not get here, command: " + cmd);
407        } 
408      }
409      
410      private boolean confirmForceManual() throws IOException {
411         return ToolRunner.confirmPrompt(
412            "You have specified the " + FORCEMANUAL + " flag. This flag is " +
413            "dangerous, as it can induce a split-brain scenario that WILL " +
414            "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
415            "\n" +
416            "It is recommended not to use this flag, but instead to shut down the " +
417            "cluster and disable automatic failover if you prefer to manually " +
418            "manage your HA state.\n" +
419            "\n" +
420            "You may abort safely by answering 'n' or hitting ^C now.\n" +
421            "\n" +
422            "Are you sure you want to continue?");
423      }
424    
425      /**
426       * Add CLI options which are specific to the failover command and no
427       * others.
428       */
429      private void addFailoverCliOpts(Options failoverOpts) {
430        failoverOpts.addOption(FORCEFENCE, false, "force fencing");
431        failoverOpts.addOption(FORCEACTIVE, false, "force failover");
432        // Don't add FORCEMANUAL, since that's added separately for all commands
433        // that change state.
434      }
435      
436      private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
437        try {
438          // Strip off the first arg, since that's just the command name
439          argv = Arrays.copyOfRange(argv, 1, argv.length); 
440          return new GnuParser().parse(opts, argv);
441        } catch (ParseException pe) {
442          errOut.println(cmdName.substring(1) +
443              ": incorrect arguments");
444          printUsage(errOut, cmdName);
445          return null;
446        }
447      }
448      
449      private int help(String[] argv) {
450        if (argv.length == 1) { // only -help
451          printUsage(out);
452          return 0;
453        } else if (argv.length != 2) {
454          printUsage(errOut, "-help");
455          return -1;
456        }
457        String cmd = argv[1];
458        if (!cmd.startsWith("-")) {
459          cmd = "-" + cmd;
460        }
461        UsageInfo usageInfo = USAGE.get(cmd);
462        if (usageInfo == null) {
463          errOut.println(cmd + ": Unknown command");
464          printUsage(errOut);
465          return -1;
466        }
467        
468        out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
469        return 0;
470      }
471      
472      protected static class UsageInfo {
473        public final String args;
474        public final String help;
475        
476        public UsageInfo(String args, String help) {
477          this.args = args;
478          this.help = help;
479        }
480      }
481    }