001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.net;
019
020import java.util.ArrayList;
021import java.util.List;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.List;
025import java.util.Random;
026import java.util.TreeMap;
027import java.util.concurrent.locks.ReadWriteLock;
028import java.util.concurrent.locks.ReentrantReadWriteLock;
029
030import com.google.common.annotations.VisibleForTesting;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033import org.apache.hadoop.classification.InterfaceAudience;
034import org.apache.hadoop.classification.InterfaceStability;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
037import org.apache.hadoop.util.ReflectionUtils;
038
039import com.google.common.base.Preconditions;
040import com.google.common.collect.Lists;
041
/**
 * The class represents a cluster of computers with a tree hierarchical
 * network topology.
 * For example, a cluster may consist of many data centers filled
 * with racks of computers.
 * In a network topology, leaves represent data nodes (computers) and inner
 * nodes represent switches/routers that manage traffic in/out of data centers
 * or racks.
 */
051@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
052@InterfaceStability.Unstable
053public class NetworkTopology {
054  public final static String DEFAULT_RACK = "/default-rack";
055  public final static int DEFAULT_HOST_LEVEL = 2;
056  public static final Log LOG =
057    LogFactory.getLog(NetworkTopology.class);
058
059  public static class InvalidTopologyException extends RuntimeException {
060    private static final long serialVersionUID = 1L;
061    public InvalidTopologyException(String msg) {
062      super(msg);
063    }
064  }
065  
066  /**
067   * Get an instance of NetworkTopology based on the value of the configuration
068   * parameter net.topology.impl.
069   * 
070   * @param conf the configuration to be used
071   * @return an instance of NetworkTopology
072   */
073  public static NetworkTopology getInstance(Configuration conf){
074    return ReflectionUtils.newInstance(
075        conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
076        NetworkTopology.class, NetworkTopology.class), conf);
077  }
078
079  /** InnerNode represents a switch/router of a data center or rack.
080   * Different from a leaf node, it has non-null children.
081   */
082  static class InnerNode extends NodeBase {
083    protected List<Node> children=new ArrayList<Node>();
084    private int numOfLeaves;
085        
086    /** Construct an InnerNode from a path-like string */
087    InnerNode(String path) {
088      super(path);
089    }
090        
091    /** Construct an InnerNode from its name and its network location */
092    InnerNode(String name, String location) {
093      super(name, location);
094    }
095        
096    /** Construct an InnerNode
097     * from its name, its network location, its parent, and its level */
098    InnerNode(String name, String location, InnerNode parent, int level) {
099      super(name, location, parent, level);
100    }
101        
102    /** @return its children */
103    List<Node> getChildren() {return children;}
104        
105    /** @return the number of children this node has */
106    int getNumOfChildren() {
107      return children.size();
108    }
109        
110    /** Judge if this node represents a rack 
111     * @return true if it has no child or its children are not InnerNodes
112     */ 
113    boolean isRack() {
114      if (children.isEmpty()) {
115        return true;
116      }
117            
118      Node firstChild = children.get(0);
119      if (firstChild instanceof InnerNode) {
120        return false;
121      }
122            
123      return true;
124    }
125        
126    /** Judge if this node is an ancestor of node <i>n</i>
127     * 
128     * @param n a node
129     * @return true if this node is an ancestor of <i>n</i>
130     */
131    boolean isAncestor(Node n) {
132      return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
133        (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
134        startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
135    }
136        
137    /** Judge if this node is the parent of node <i>n</i>
138     * 
139     * @param n a node
140     * @return true if this node is the parent of <i>n</i>
141     */
142    boolean isParent(Node n) {
143      return n.getNetworkLocation().equals(getPath(this));
144    }
145        
146    /* Return a child name of this node who is an ancestor of node <i>n</i> */
147    private String getNextAncestorName(Node n) {
148      if (!isAncestor(n)) {
149        throw new IllegalArgumentException(
150                                           this + "is not an ancestor of " + n);
151      }
152      String name = n.getNetworkLocation().substring(getPath(this).length());
153      if (name.charAt(0) == PATH_SEPARATOR) {
154        name = name.substring(1);
155      }
156      int index=name.indexOf(PATH_SEPARATOR);
157      if (index !=-1)
158        name = name.substring(0, index);
159      return name;
160    }
161        
162    /** Add node <i>n</i> to the subtree of this node 
163     * @param n node to be added
164     * @return true if the node is added; false otherwise
165     */
166    boolean add(Node n) {
167      if (!isAncestor(n))
168        throw new IllegalArgumentException(n.getName()+", which is located at "
169                +n.getNetworkLocation()+", is not a decendent of "
170                +getPath(this));
171      if (isParent(n)) {
172        // this node is the parent of n; add n directly
173        n.setParent(this);
174        n.setLevel(this.level+1);
175        for(int i=0; i<children.size(); i++) {
176          if (children.get(i).getName().equals(n.getName())) {
177            children.set(i, n);
178            return false;
179          }
180        }
181        children.add(n);
182        numOfLeaves++;
183        return true;
184      } else {
185        // find the next ancestor node
186        String parentName = getNextAncestorName(n);
187        InnerNode parentNode = null;
188        for(int i=0; i<children.size(); i++) {
189          if (children.get(i).getName().equals(parentName)) {
190            parentNode = (InnerNode)children.get(i);
191            break;
192          }
193        }
194        if (parentNode == null) {
195          // create a new InnerNode
196          parentNode = createParentNode(parentName);
197          children.add(parentNode);
198        }
199        // add n to the subtree of the next ancestor node
200        if (parentNode.add(n)) {
201          numOfLeaves++;
202          return true;
203        } else {
204          return false;
205        }
206      }
207    }
208
209    /**
210     * Creates a parent node to be added to the list of children.  
211     * Creates a node using the InnerNode four argument constructor specifying 
212     * the name, location, parent, and level of this node.
213     * 
214     * <p>To be overridden in subclasses for specific InnerNode implementations,
215     * as alternative to overriding the full {@link #add(Node)} method.
216     * 
217     * @param parentName The name of the parent node
218     * @return A new inner node
219     * @see InnerNode#InnerNode(String, String, InnerNode, int)
220     */
221    protected InnerNode createParentNode(String parentName) {
222      return new InnerNode(parentName, getPath(this), this, this.getLevel()+1);
223    }
224
225    /** Remove node <i>n</i> from the subtree of this node
226     * @param n node to be deleted 
227     * @return true if the node is deleted; false otherwise
228     */
229    boolean remove(Node n) {
230      String parent = n.getNetworkLocation();
231      String currentPath = getPath(this);
232      if (!isAncestor(n))
233        throw new IllegalArgumentException(n.getName()
234                                           +", which is located at "
235                                           +parent+", is not a descendent of "+currentPath);
236      if (isParent(n)) {
237        // this node is the parent of n; remove n directly
238        for(int i=0; i<children.size(); i++) {
239          if (children.get(i).getName().equals(n.getName())) {
240            children.remove(i);
241            numOfLeaves--;
242            n.setParent(null);
243            return true;
244          }
245        }
246        return false;
247      } else {
248        // find the next ancestor node: the parent node
249        String parentName = getNextAncestorName(n);
250        InnerNode parentNode = null;
251        int i;
252        for(i=0; i<children.size(); i++) {
253          if (children.get(i).getName().equals(parentName)) {
254            parentNode = (InnerNode)children.get(i);
255            break;
256          }
257        }
258        if (parentNode==null) {
259          return false;
260        }
261        // remove n from the parent node
262        boolean isRemoved = parentNode.remove(n);
263        // if the parent node has no children, remove the parent node too
264        if (isRemoved) {
265          if (parentNode.getNumOfChildren() == 0) {
266            children.remove(i);
267          }
268          numOfLeaves--;
269        }
270        return isRemoved;
271      }
272    } // end of remove
273        
274    /** Given a node's string representation, return a reference to the node
275     * @param loc string location of the form /rack/node
276     * @return null if the node is not found or the childnode is there but
277     * not an instance of {@link InnerNode}
278     */
279    private Node getLoc(String loc) {
280      if (loc == null || loc.length() == 0) return this;
281            
282      String[] path = loc.split(PATH_SEPARATOR_STR, 2);
283      Node childnode = null;
284      for(int i=0; i<children.size(); i++) {
285        if (children.get(i).getName().equals(path[0])) {
286          childnode = children.get(i);
287        }
288      }
289      if (childnode == null) return null; // non-existing node
290      if (path.length == 1) return childnode;
291      if (childnode instanceof InnerNode) {
292        return ((InnerNode)childnode).getLoc(path[1]);
293      } else {
294        return null;
295      }
296    }
297        
298    /** get <i>leafIndex</i> leaf of this subtree 
299     * if it is not in the <i>excludedNode</i>
300     *
301     * @param leafIndex an indexed leaf of the node
302     * @param excludedNode an excluded node (can be null)
303     * @return
304     */
305    Node getLeaf(int leafIndex, Node excludedNode) {
306      int count=0;
307      // check if the excluded node a leaf
308      boolean isLeaf =
309        excludedNode == null || !(excludedNode instanceof InnerNode);
310      // calculate the total number of excluded leaf nodes
311      int numOfExcludedLeaves =
312        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
313      if (isLeafParent()) { // children are leaves
314        if (isLeaf) { // excluded node is a leaf node
315          int excludedIndex = children.indexOf(excludedNode);
316          if (excludedIndex != -1 && leafIndex >= 0) {
317            // excluded node is one of the children so adjust the leaf index
318            leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
319          }
320        }
321        // range check
322        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
323          return null;
324        }
325        return children.get(leafIndex);
326      } else {
327        for(int i=0; i<children.size(); i++) {
328          InnerNode child = (InnerNode)children.get(i);
329          if (excludedNode == null || excludedNode != child) {
330            // not the excludedNode
331            int numOfLeaves = child.getNumOfLeaves();
332            if (excludedNode != null && child.isAncestor(excludedNode)) {
333              numOfLeaves -= numOfExcludedLeaves;
334            }
335            if (count+numOfLeaves > leafIndex) {
336              // the leaf is in the child subtree
337              return child.getLeaf(leafIndex-count, excludedNode);
338            } else {
339              // go to the next child
340              count = count+numOfLeaves;
341            }
342          } else { // it is the excluededNode
343            // skip it and set the excludedNode to be null
344            excludedNode = null;
345          }
346        }
347        return null;
348      }
349    }
350    
351    protected boolean isLeafParent() {
352      return isRack();
353    }
354
355    /**
356      * Determine if children a leaves, default implementation calls {@link #isRack()}
357      * <p>To be overridden in subclasses for specific InnerNode implementations,
358      * as alternative to overriding the full {@link #getLeaf(int, Node)} method.
359      * 
360      * @return true if children are leaves, false otherwise
361      */
362    protected boolean areChildrenLeaves() {
363      return isRack();
364    }
365
366    /**
367     * Get number of leaves.
368     */
369    int getNumOfLeaves() {
370      return numOfLeaves;
371    }
372  } // end of InnerNode
373
374  /**
375   * the root cluster map
376   */
377  InnerNode clusterMap;
378  /** Depth of all leaf nodes */
379  private int depthOfAllLeaves = -1;
380  /** rack counter */
381  protected int numOfRacks = 0;
382
383  /**
384   * Whether or not this cluster has ever consisted of more than 1 rack,
385   * according to the NetworkTopology.
386   */
387  private boolean clusterEverBeenMultiRack = false;
388
389  /** the lock used to manage access */
390  protected ReadWriteLock netlock = new ReentrantReadWriteLock();
391
392  public NetworkTopology() {
393    clusterMap = new InnerNode(InnerNode.ROOT);
394  }
395
396  /** Add a leaf node
397   * Update node counter & rack counter if necessary
398   * @param node node to be added; can be null
399   * @exception IllegalArgumentException if add a node to a leave 
400                                         or node to be added is not a leaf
401   */
402  public void add(Node node) {
403    if (node==null) return;
404    int newDepth = NodeBase.locationToDepth(node.getNetworkLocation()) + 1;
405    netlock.writeLock().lock();
406    try {
407      String oldTopoStr = this.toString();
408      if( node instanceof InnerNode ) {
409        throw new IllegalArgumentException(
410          "Not allow to add an inner node: "+NodeBase.getPath(node));
411      }
412      if ((depthOfAllLeaves != -1) && (depthOfAllLeaves != newDepth)) {
413        LOG.error("Error: can't add leaf node " + NodeBase.getPath(node) +
414            " at depth " + newDepth + " to topology:\n" + oldTopoStr);
415        throw new InvalidTopologyException("Failed to add " + NodeBase.getPath(node) +
416            ": You cannot have a rack and a non-rack node at the same " +
417            "level of the network topology.");
418      }
419      Node rack = getNodeForNetworkLocation(node);
420      if (rack != null && !(rack instanceof InnerNode)) {
421        throw new IllegalArgumentException("Unexpected data node " 
422                                           + node.toString() 
423                                           + " at an illegal network location");
424      }
425      if (clusterMap.add(node)) {
426        LOG.info("Adding a new node: "+NodeBase.getPath(node));
427        if (rack == null) {
428          incrementRacks();
429        }
430        if (!(node instanceof InnerNode)) {
431          if (depthOfAllLeaves == -1) {
432            depthOfAllLeaves = node.getLevel();
433          }
434        }
435      }
436      if(LOG.isDebugEnabled()) {
437        LOG.debug("NetworkTopology became:\n" + this.toString());
438      }
439    } finally {
440      netlock.writeLock().unlock();
441    }
442  }
443
444  protected void incrementRacks() {
445    numOfRacks++;
446    if (!clusterEverBeenMultiRack && numOfRacks > 1) {
447      clusterEverBeenMultiRack = true;
448    }
449  }
450
451  /**
452   * Return a reference to the node given its string representation.
453   * Default implementation delegates to {@link #getNode(String)}.
454   * 
455   * <p>To be overridden in subclasses for specific NetworkTopology 
456   * implementations, as alternative to overriding the full {@link #add(Node)}
457   *  method.
458   * 
459   * @param node The string representation of this node's network location is
460   * used to retrieve a Node object. 
461   * @return a reference to the node; null if the node is not in the tree
462   * 
463   * @see #add(Node)
464   * @see #getNode(String)
465   */
466  protected Node getNodeForNetworkLocation(Node node) {
467    return getNode(node.getNetworkLocation());
468  }
469  
470  /**
471   * Given a string representation of a rack, return its children
472   * @param loc a path-like string representation of a rack
473   * @return a newly allocated list with all the node's children
474   */
475  public List<Node> getDatanodesInRack(String loc) {
476    netlock.readLock().lock();
477    try {
478      loc = NodeBase.normalize(loc);
479      if (!NodeBase.ROOT.equals(loc)) {
480        loc = loc.substring(1);
481      }
482      InnerNode rack = (InnerNode) clusterMap.getLoc(loc);
483      if (rack == null) {
484        return null;
485      }
486      return new ArrayList<Node>(rack.getChildren());
487    } finally {
488      netlock.readLock().unlock();
489    }
490  }
491
492  /** Remove a node
493   * Update node counter and rack counter if necessary
494   * @param node node to be removed; can be null
495   */ 
496  public void remove(Node node) {
497    if (node==null) return;
498    if( node instanceof InnerNode ) {
499      throw new IllegalArgumentException(
500        "Not allow to remove an inner node: "+NodeBase.getPath(node));
501    }
502    LOG.info("Removing a node: "+NodeBase.getPath(node));
503    netlock.writeLock().lock();
504    try {
505      if (clusterMap.remove(node)) {
506        InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
507        if (rack == null) {
508          numOfRacks--;
509        }
510      }
511      if(LOG.isDebugEnabled()) {
512        LOG.debug("NetworkTopology became:\n" + this.toString());
513      }
514    } finally {
515      netlock.writeLock().unlock();
516    }
517  }
518
519  /** Check if the tree contains node <i>node</i>
520   * 
521   * @param node a node
522   * @return true if <i>node</i> is already in the tree; false otherwise
523   */
524  public boolean contains(Node node) {
525    if (node == null) return false;
526    netlock.readLock().lock();
527    try {
528      Node parent = node.getParent();
529      for (int level = node.getLevel(); parent != null && level > 0;
530           parent = parent.getParent(), level--) {
531        if (parent == clusterMap) {
532          return true;
533        }
534      }
535    } finally {
536      netlock.readLock().unlock();
537    }
538    return false; 
539  }
540    
541  /** Given a string representation of a node, return its reference
542   * 
543   * @param loc
544   *          a path-like string representation of a node
545   * @return a reference to the node; null if the node is not in the tree
546   */
547  public Node getNode(String loc) {
548    netlock.readLock().lock();
549    try {
550      loc = NodeBase.normalize(loc);
551      if (!NodeBase.ROOT.equals(loc))
552        loc = loc.substring(1);
553      return clusterMap.getLoc(loc);
554    } finally {
555      netlock.readLock().unlock();
556    }
557  }
558
559  /**
560   * @return true if this cluster has ever consisted of multiple racks, even if
561   *         it is not now a multi-rack cluster.
562   */
563  public boolean hasClusterEverBeenMultiRack() {
564    return clusterEverBeenMultiRack;
565  }
566
567  /** Given a string representation of a rack for a specific network
568   *  location
569   *
570   * To be overridden in subclasses for specific NetworkTopology 
571   * implementations, as alternative to overriding the full 
572   * {@link #getRack(String)} method.
573   * @param loc
574   *          a path-like string representation of a network location
575   * @return a rack string
576   */
577  public String getRack(String loc) {
578    return loc;
579  }
580  
581  /** @return the total number of racks */
582  public int getNumOfRacks() {
583    netlock.readLock().lock();
584    try {
585      return numOfRacks;
586    } finally {
587      netlock.readLock().unlock();
588    }
589  }
590
591  /** @return the total number of leaf nodes */
592  public int getNumOfLeaves() {
593    netlock.readLock().lock();
594    try {
595      return clusterMap.getNumOfLeaves();
596    } finally {
597      netlock.readLock().unlock();
598    }
599  }
600
601  /** Return the distance between two nodes
602   * It is assumed that the distance from one node to its parent is 1
603   * The distance between two nodes is calculated by summing up their distances
604   * to their closest common ancestor.
605   * @param node1 one node
606   * @param node2 another node
607   * @return the distance between node1 and node2 which is zero if they are the same
608   *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
609   */
610  public int getDistance(Node node1, Node node2) {
611    if (node1 == node2) {
612      return 0;
613    }
614    Node n1=node1, n2=node2;
615    int dis = 0;
616    netlock.readLock().lock();
617    try {
618      int level1=node1.getLevel(), level2=node2.getLevel();
619      while(n1!=null && level1>level2) {
620        n1 = n1.getParent();
621        level1--;
622        dis++;
623      }
624      while(n2!=null && level2>level1) {
625        n2 = n2.getParent();
626        level2--;
627        dis++;
628      }
629      while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
630        n1=n1.getParent();
631        n2=n2.getParent();
632        dis+=2;
633      }
634    } finally {
635      netlock.readLock().unlock();
636    }
637    if (n1==null) {
638      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
639      return Integer.MAX_VALUE;
640    }
641    if (n2==null) {
642      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
643      return Integer.MAX_VALUE;
644    }
645    return dis+2;
646  }
647
648  /** Check if two nodes are on the same rack
649   * @param node1 one node (can be null)
650   * @param node2 another node (can be null)
651   * @return true if node1 and node2 are on the same rack; false otherwise
652   * @exception IllegalArgumentException when either node1 or node2 is null, or
653   * node1 or node2 do not belong to the cluster
654   */
655  public boolean isOnSameRack( Node node1,  Node node2) {
656    if (node1 == null || node2 == null) {
657      return false;
658    }
659      
660    netlock.readLock().lock();
661    try {
662      return isSameParents(node1, node2);
663    } finally {
664      netlock.readLock().unlock();
665    }
666  }
667  
668  /**
669   * Check if network topology is aware of NodeGroup
670   */
671  public boolean isNodeGroupAware() {
672    return false;
673  }
674  
675  /** 
676   * Return false directly as not aware of NodeGroup, to be override in sub-class
677   */
678  public boolean isOnSameNodeGroup(Node node1, Node node2) {
679    return false;
680  }
681
682  /**
683   * Compare the parents of each node for equality
684   * 
685   * <p>To be overridden in subclasses for specific NetworkTopology 
686   * implementations, as alternative to overriding the full 
687   * {@link #isOnSameRack(Node, Node)} method.
688   * 
689   * @param node1 the first node to compare
690   * @param node2 the second node to compare
691   * @return true if their parents are equal, false otherwise
692   * 
693   * @see #isOnSameRack(Node, Node)
694   */
695  protected boolean isSameParents(Node node1, Node node2) {
696    return node1.getParent()==node2.getParent();
697  }
698
699  private static final Random r = new Random();
700
701  @VisibleForTesting
702  void setRandomSeed(long seed) {
703    r.setSeed(seed);
704  }
705
706  /** randomly choose one node from <i>scope</i>
707   * if scope starts with ~, choose one from the all nodes except for the
708   * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
709   * @param scope range of nodes from which a node will be chosen
710   * @return the chosen node
711   */
712  public Node chooseRandom(String scope) {
713    netlock.readLock().lock();
714    try {
715      if (scope.startsWith("~")) {
716        return chooseRandom(NodeBase.ROOT, scope.substring(1));
717      } else {
718        return chooseRandom(scope, null);
719      }
720    } finally {
721      netlock.readLock().unlock();
722    }
723  }
724
725  private Node chooseRandom(String scope, String excludedScope){
726    if (excludedScope != null) {
727      if (scope.startsWith(excludedScope)) {
728        return null;
729      }
730      if (!excludedScope.startsWith(scope)) {
731        excludedScope = null;
732      }
733    }
734    Node node = getNode(scope);
735    if (!(node instanceof InnerNode)) {
736      return node;
737    }
738    InnerNode innerNode = (InnerNode)node;
739    int numOfDatanodes = innerNode.getNumOfLeaves();
740    if (excludedScope == null) {
741      node = null;
742    } else {
743      node = getNode(excludedScope);
744      if (!(node instanceof InnerNode)) {
745        numOfDatanodes -= 1;
746      } else {
747        numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
748      }
749    }
750    if (numOfDatanodes == 0) {
751      throw new InvalidTopologyException(
752          "Failed to find datanode (scope=\"" + String.valueOf(scope) +
753          "\" excludedScope=\"" + String.valueOf(excludedScope) + "\").");
754    }
755    int leaveIndex = r.nextInt(numOfDatanodes);
756    return innerNode.getLeaf(leaveIndex, node);
757  }
758
759  /** return leaves in <i>scope</i>
760   * @param scope a path string
761   * @return leaves nodes under specific scope
762   */
763  public List<Node> getLeaves(String scope) {
764    Node node = getNode(scope);
765    List<Node> leafNodes = new ArrayList<Node>();
766    if (!(node instanceof InnerNode)) {
767      leafNodes.add(node);
768    } else {
769      InnerNode innerNode = (InnerNode) node;
770      for (int i=0;i<innerNode.getNumOfLeaves();i++) {
771        leafNodes.add(innerNode.getLeaf(i, null));
772      }
773    }
774    return leafNodes;
775  }
776
777  /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
778   * if scope starts with ~, return the number of nodes that are not
779   * in <i>scope</i> and <i>excludedNodes</i>; 
780   * @param scope a path string that may start with ~
781   * @param excludedNodes a list of nodes
782   * @return number of available nodes
783   */
784  public int countNumOfAvailableNodes(String scope,
785                                      Collection<Node> excludedNodes) {
786    boolean isExcluded=false;
787    if (scope.startsWith("~")) {
788      isExcluded=true;
789      scope=scope.substring(1);
790    }
791    scope = NodeBase.normalize(scope);
792    int excludedCountInScope = 0; // the number of nodes in both scope & excludedNodes
793    int excludedCountOffScope = 0; // the number of nodes outside scope & excludedNodes
794    netlock.readLock().lock();
795    try {
796      for (Node node : excludedNodes) {
797        node = getNode(NodeBase.getPath(node));
798        if (node == null) {
799          continue;
800        }
801        if ((NodeBase.getPath(node) + NodeBase.PATH_SEPARATOR_STR)
802            .startsWith(scope + NodeBase.PATH_SEPARATOR_STR)) {
803          excludedCountInScope++;
804        } else {
805          excludedCountOffScope++;
806        }
807      }
808      Node n = getNode(scope);
809      int scopeNodeCount = 0;
810      if (n != null) {
811        scopeNodeCount++;
812      }
813      if (n instanceof InnerNode) {
814        scopeNodeCount=((InnerNode)n).getNumOfLeaves();
815      }
816      if (isExcluded) {
817        return clusterMap.getNumOfLeaves() - scopeNodeCount
818            - excludedCountOffScope;
819      } else {
820        return scopeNodeCount - excludedCountInScope;
821      }
822    } finally {
823      netlock.readLock().unlock();
824    }
825  }
826
827  /** convert a network tree to a string */
828  @Override
829  public String toString() {
830    // print the number of racks
831    StringBuilder tree = new StringBuilder();
832    tree.append("Number of racks: ");
833    tree.append(numOfRacks);
834    tree.append("\n");
835    // print the number of leaves
836    int numOfLeaves = getNumOfLeaves();
837    tree.append("Expected number of leaves:");
838    tree.append(numOfLeaves);
839    tree.append("\n");
840    // print nodes
841    for(int i=0; i<numOfLeaves; i++) {
842      tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
843      tree.append("\n");
844    }
845    return tree.toString();
846  }
847  
848  /**
849   * Divide networklocation string into two parts by last separator, and get 
850   * the first part here.
851   * 
852   * @param networkLocation
853   * @return
854   */
855  public static String getFirstHalf(String networkLocation) {
856    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
857    return networkLocation.substring(0, index);
858  }
859
860  /**
861   * Divide networklocation string into two parts by last separator, and get 
862   * the second part here.
863   * 
864   * @param networkLocation
865   * @return
866   */
867  public static String getLastHalf(String networkLocation) {
868    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
869    return networkLocation.substring(index);
870  }
871
872  /**
873   * Returns an integer weight which specifies how far away {node} is away from
874   * {reader}. A lower value signifies that a node is closer.
875   * 
876   * @param reader Node where data will be read
877   * @param node Replica of data
878   * @return weight
879   */
880  protected int getWeight(Node reader, Node node) {
881    // 0 is local, 1 is same rack, 2 is off rack
882    // Start off by initializing to off rack
883    int weight = 2;
884    if (reader != null) {
885      if (reader.equals(node)) {
886        weight = 0;
887      } else if (isOnSameRack(reader, node)) {
888        weight = 1;
889      }
890    }
891    return weight;
892  }
893
894  /**
895   * Sort nodes array by network distance to <i>reader</i>.
896   * <p/>
897   * In a three-level topology, a node can be either local, on the same rack,
898   * or on a different rack from the reader. Sorting the nodes based on network
899   * distance from the reader reduces network traffic and improves
900   * performance.
901   * <p/>
902   * As an additional twist, we also randomize the nodes at each network
903   * distance. This helps with load balancing when there is data skew.
904   *
905   * @param reader    Node where data will be read
906   * @param nodes     Available replicas with the requested data
907   * @param activeLen Number of active nodes at the front of the array
908   */
909  public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
910    /** Sort weights for the nodes array */
911    int[] weights = new int[activeLen];
912    for (int i=0; i<activeLen; i++) {
913      weights[i] = getWeight(reader, nodes[i]);
914    }
915    // Add weight/node pairs to a TreeMap to sort
916    TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
917    for (int i=0; i<activeLen; i++) {
918      int weight = weights[i];
919      Node node = nodes[i];
920      List<Node> list = tree.get(weight);
921      if (list == null) {
922        list = Lists.newArrayListWithExpectedSize(1);
923        tree.put(weight, list);
924      }
925      list.add(node);
926    }
927
928    int idx = 0;
929    for (List<Node> list: tree.values()) {
930      if (list != null) {
931        Collections.shuffle(list, r);
932        for (Node n: list) {
933          nodes[idx] = n;
934          idx++;
935        }
936      }
937    }
938    Preconditions.checkState(idx == activeLen,
939        "Sorted the wrong number of nodes!");
940  }
941}