001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.net;
019
020import java.util.ArrayList;
021import java.util.HashMap;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.List;
025import java.util.Map;
026import java.util.Random;
027import java.util.TreeMap;
028import java.util.concurrent.locks.ReadWriteLock;
029import java.util.concurrent.locks.ReentrantReadWriteLock;
030
031import com.google.common.annotations.VisibleForTesting;
032import org.apache.commons.logging.Log;
033import org.apache.commons.logging.LogFactory;
034import org.apache.hadoop.classification.InterfaceAudience;
035import org.apache.hadoop.classification.InterfaceStability;
036import org.apache.hadoop.conf.Configuration;
037import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
038import org.apache.hadoop.util.ReflectionUtils;
039
040import com.google.common.base.Preconditions;
041import com.google.common.collect.Lists;
042
/** The class represents a cluster of computers with a tree-structured,
 * hierarchical network topology.
 * For example, a cluster may consist of many data centers filled
 * with racks of computers.
 * In a network topology, leaves represent data nodes (computers) and inner
 * nodes represent switches/routers that manage traffic in/out of data centers
 * or racks.
 *
 */
052@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
053@InterfaceStability.Unstable
054public class NetworkTopology {
055  public final static String DEFAULT_RACK = "/default-rack";
056  public final static int DEFAULT_HOST_LEVEL = 2;
057  public static final Log LOG =
058    LogFactory.getLog(NetworkTopology.class);
059
060  public static class InvalidTopologyException extends RuntimeException {
061    private static final long serialVersionUID = 1L;
062    public InvalidTopologyException(String msg) {
063      super(msg);
064    }
065  }
066  
067  /**
068   * Get an instance of NetworkTopology based on the value of the configuration
069   * parameter net.topology.impl.
070   * 
071   * @param conf the configuration to be used
072   * @return an instance of NetworkTopology
073   */
074  public static NetworkTopology getInstance(Configuration conf){
075    return ReflectionUtils.newInstance(
076        conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
077        NetworkTopology.class, NetworkTopology.class), conf);
078  }
079
080  /** InnerNode represents a switch/router of a data center or rack.
081   * Different from a leaf node, it has non-null children.
082   */
083  static class InnerNode extends NodeBase {
084    protected List<Node> children=new ArrayList<Node>();
085    private Map<String, Node> childrenMap = new HashMap<String, Node>();
086    private int numOfLeaves;
087        
088    /** Construct an InnerNode from a path-like string */
089    InnerNode(String path) {
090      super(path);
091    }
092        
093    /** Construct an InnerNode from its name and its network location */
094    InnerNode(String name, String location) {
095      super(name, location);
096    }
097        
098    /** Construct an InnerNode
099     * from its name, its network location, its parent, and its level */
100    InnerNode(String name, String location, InnerNode parent, int level) {
101      super(name, location, parent, level);
102    }
103        
104    /** @return its children */
105    List<Node> getChildren() {return children;}
106        
107    /** @return the number of children this node has */
108    int getNumOfChildren() {
109      return children.size();
110    }
111        
112    /** Judge if this node represents a rack 
113     * @return true if it has no child or its children are not InnerNodes
114     */ 
115    boolean isRack() {
116      if (children.isEmpty()) {
117        return true;
118      }
119            
120      Node firstChild = children.get(0);
121      if (firstChild instanceof InnerNode) {
122        return false;
123      }
124            
125      return true;
126    }
127        
128    /** Judge if this node is an ancestor of node <i>n</i>
129     * 
130     * @param n a node
131     * @return true if this node is an ancestor of <i>n</i>
132     */
133    boolean isAncestor(Node n) {
134      return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
135        (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
136        startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
137    }
138        
139    /** Judge if this node is the parent of node <i>n</i>
140     * 
141     * @param n a node
142     * @return true if this node is the parent of <i>n</i>
143     */
144    boolean isParent(Node n) {
145      return n.getNetworkLocation().equals(getPath(this));
146    }
147        
148    /* Return a child name of this node who is an ancestor of node <i>n</i> */
149    private String getNextAncestorName(Node n) {
150      if (!isAncestor(n)) {
151        throw new IllegalArgumentException(
152                                           this + "is not an ancestor of " + n);
153      }
154      String name = n.getNetworkLocation().substring(getPath(this).length());
155      if (name.charAt(0) == PATH_SEPARATOR) {
156        name = name.substring(1);
157      }
158      int index=name.indexOf(PATH_SEPARATOR);
159      if (index !=-1)
160        name = name.substring(0, index);
161      return name;
162    }
163        
164    /** Add node <i>n</i> to the subtree of this node 
165     * @param n node to be added
166     * @return true if the node is added; false otherwise
167     */
168    boolean add(Node n) {
169      if (!isAncestor(n))
170        throw new IllegalArgumentException(n.getName()+", which is located at "
171                +n.getNetworkLocation()+", is not a decendent of "
172                +getPath(this));
173      if (isParent(n)) {
174        // this node is the parent of n; add n directly
175        n.setParent(this);
176        n.setLevel(this.level+1);
177        Node prev = childrenMap.put(n.getName(), n);
178        if (prev != null) {
179          for(int i=0; i<children.size(); i++) {
180            if (children.get(i).getName().equals(n.getName())) {
181              children.set(i, n);
182              return false;
183            }
184          }
185        }
186        children.add(n);
187        numOfLeaves++;
188        return true;
189      } else {
190        // find the next ancestor node
191        String parentName = getNextAncestorName(n);
192        InnerNode parentNode = (InnerNode)childrenMap.get(parentName);
193        if (parentNode == null) {
194          // create a new InnerNode
195          parentNode = createParentNode(parentName);
196          children.add(parentNode);
197          childrenMap.put(parentNode.getName(), parentNode);
198        }
199        // add n to the subtree of the next ancestor node
200        if (parentNode.add(n)) {
201          numOfLeaves++;
202          return true;
203        } else {
204          return false;
205        }
206      }
207    }
208
209    /**
210     * Creates a parent node to be added to the list of children.  
211     * Creates a node using the InnerNode four argument constructor specifying 
212     * the name, location, parent, and level of this node.
213     * 
214     * <p>To be overridden in subclasses for specific InnerNode implementations,
215     * as alternative to overriding the full {@link #add(Node)} method.
216     * 
217     * @param parentName The name of the parent node
218     * @return A new inner node
219     * @see InnerNode#InnerNode(String, String, InnerNode, int)
220     */
221    protected InnerNode createParentNode(String parentName) {
222      return new InnerNode(parentName, getPath(this), this, this.getLevel()+1);
223    }
224
225    /** Remove node <i>n</i> from the subtree of this node
226     * @param n node to be deleted 
227     * @return true if the node is deleted; false otherwise
228     */
229    boolean remove(Node n) {
230      String parent = n.getNetworkLocation();
231      String currentPath = getPath(this);
232      if (!isAncestor(n))
233        throw new IllegalArgumentException(n.getName()
234                                           +", which is located at "
235                                           +parent+", is not a descendent of "+currentPath);
236      if (isParent(n)) {
237        // this node is the parent of n; remove n directly
238        if (childrenMap.containsKey(n.getName())) {
239          for (int i=0; i<children.size(); i++) {
240            if (children.get(i).getName().equals(n.getName())) {
241              children.remove(i);
242              childrenMap.remove(n.getName());
243              numOfLeaves--;
244              n.setParent(null);
245              return true;
246            }
247          }
248        }
249        return false;
250      } else {
251        // find the next ancestor node: the parent node
252        String parentName = getNextAncestorName(n);
253        InnerNode parentNode = null;
254        int i;
255        for(i=0; i<children.size(); i++) {
256          if (children.get(i).getName().equals(parentName)) {
257            parentNode = (InnerNode)children.get(i);
258            break;
259          }
260        }
261        if (parentNode==null) {
262          return false;
263        }
264        // remove n from the parent node
265        boolean isRemoved = parentNode.remove(n);
266        // if the parent node has no children, remove the parent node too
267        if (isRemoved) {
268          if (parentNode.getNumOfChildren() == 0) {
269            Node prev = children.remove(i);
270            childrenMap.remove(prev.getName());
271          }
272          numOfLeaves--;
273        }
274        return isRemoved;
275      }
276    } // end of remove
277        
278    /** Given a node's string representation, return a reference to the node
279     * @param loc string location of the form /rack/node
280     * @return null if the node is not found or the childnode is there but
281     * not an instance of {@link InnerNode}
282     */
283    private Node getLoc(String loc) {
284      if (loc == null || loc.length() == 0) return this;
285            
286      String[] path = loc.split(PATH_SEPARATOR_STR, 2);
287      Node childnode = childrenMap.get(path[0]);
288      if (childnode == null) return null; // non-existing node
289      if (path.length == 1) return childnode;
290      if (childnode instanceof InnerNode) {
291        return ((InnerNode)childnode).getLoc(path[1]);
292      } else {
293        return null;
294      }
295    }
296        
297    /** get <i>leafIndex</i> leaf of this subtree 
298     * if it is not in the <i>excludedNode</i>
299     *
300     * @param leafIndex an indexed leaf of the node
301     * @param excludedNode an excluded node (can be null)
302     * @return
303     */
304    Node getLeaf(int leafIndex, Node excludedNode) {
305      int count=0;
306      // check if the excluded node a leaf
307      boolean isLeaf =
308        excludedNode == null || !(excludedNode instanceof InnerNode);
309      // calculate the total number of excluded leaf nodes
310      int numOfExcludedLeaves =
311        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
312      if (isLeafParent()) { // children are leaves
313        if (isLeaf) { // excluded node is a leaf node
314          if (excludedNode != null &&
315              childrenMap.containsKey(excludedNode.getName())) {
316            int excludedIndex = children.indexOf(excludedNode);
317            if (excludedIndex != -1 && leafIndex >= 0) {
318              // excluded node is one of the children so adjust the leaf index
319              leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
320            }
321          }
322        }
323        // range check
324        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
325          return null;
326        }
327        return children.get(leafIndex);
328      } else {
329        for(int i=0; i<children.size(); i++) {
330          InnerNode child = (InnerNode)children.get(i);
331          if (excludedNode == null || excludedNode != child) {
332            // not the excludedNode
333            int numOfLeaves = child.getNumOfLeaves();
334            if (excludedNode != null && child.isAncestor(excludedNode)) {
335              numOfLeaves -= numOfExcludedLeaves;
336            }
337            if (count+numOfLeaves > leafIndex) {
338              // the leaf is in the child subtree
339              return child.getLeaf(leafIndex-count, excludedNode);
340            } else {
341              // go to the next child
342              count = count+numOfLeaves;
343            }
344          } else { // it is the excluededNode
345            // skip it and set the excludedNode to be null
346            excludedNode = null;
347          }
348        }
349        return null;
350      }
351    }
352    
353    protected boolean isLeafParent() {
354      return isRack();
355    }
356
357    /**
358      * Determine if children a leaves, default implementation calls {@link #isRack()}
359      * <p>To be overridden in subclasses for specific InnerNode implementations,
360      * as alternative to overriding the full {@link #getLeaf(int, Node)} method.
361      * 
362      * @return true if children are leaves, false otherwise
363      */
364    protected boolean areChildrenLeaves() {
365      return isRack();
366    }
367
368    /**
369     * Get number of leaves.
370     */
371    int getNumOfLeaves() {
372      return numOfLeaves;
373    }
374  } // end of InnerNode
375
376  /**
377   * the root cluster map
378   */
379  InnerNode clusterMap;
380  /** Depth of all leaf nodes */
381  private int depthOfAllLeaves = -1;
382  /** rack counter */
383  protected int numOfRacks = 0;
384
385  /**
386   * Whether or not this cluster has ever consisted of more than 1 rack,
387   * according to the NetworkTopology.
388   */
389  private boolean clusterEverBeenMultiRack = false;
390
391  /** the lock used to manage access */
392  protected ReadWriteLock netlock = new ReentrantReadWriteLock();
393
394  public NetworkTopology() {
395    clusterMap = new InnerNode(InnerNode.ROOT);
396  }
397
398  /** Add a leaf node
399   * Update node counter & rack counter if necessary
400   * @param node node to be added; can be null
401   * @exception IllegalArgumentException if add a node to a leave 
402                                         or node to be added is not a leaf
403   */
404  public void add(Node node) {
405    if (node==null) return;
406    int newDepth = NodeBase.locationToDepth(node.getNetworkLocation()) + 1;
407    netlock.writeLock().lock();
408    try {
409      if( node instanceof InnerNode ) {
410        throw new IllegalArgumentException(
411          "Not allow to add an inner node: "+NodeBase.getPath(node));
412      }
413      if ((depthOfAllLeaves != -1) && (depthOfAllLeaves != newDepth)) {
414        LOG.error("Error: can't add leaf node " + NodeBase.getPath(node) +
415            " at depth " + newDepth + " to topology:\n" + this.toString());
416        throw new InvalidTopologyException("Failed to add " + NodeBase.getPath(node) +
417            ": You cannot have a rack and a non-rack node at the same " +
418            "level of the network topology.");
419      }
420      Node rack = getNodeForNetworkLocation(node);
421      if (rack != null && !(rack instanceof InnerNode)) {
422        throw new IllegalArgumentException("Unexpected data node " 
423                                           + node.toString() 
424                                           + " at an illegal network location");
425      }
426      if (clusterMap.add(node)) {
427        LOG.info("Adding a new node: "+NodeBase.getPath(node));
428        if (rack == null) {
429          incrementRacks();
430        }
431        if (!(node instanceof InnerNode)) {
432          if (depthOfAllLeaves == -1) {
433            depthOfAllLeaves = node.getLevel();
434          }
435        }
436      }
437      if(LOG.isDebugEnabled()) {
438        LOG.debug("NetworkTopology became:\n" + this.toString());
439      }
440    } finally {
441      netlock.writeLock().unlock();
442    }
443  }
444
445  protected void incrementRacks() {
446    numOfRacks++;
447    if (!clusterEverBeenMultiRack && numOfRacks > 1) {
448      clusterEverBeenMultiRack = true;
449    }
450  }
451
452  /**
453   * Return a reference to the node given its string representation.
454   * Default implementation delegates to {@link #getNode(String)}.
455   * 
456   * <p>To be overridden in subclasses for specific NetworkTopology 
457   * implementations, as alternative to overriding the full {@link #add(Node)}
458   *  method.
459   * 
460   * @param node The string representation of this node's network location is
461   * used to retrieve a Node object. 
462   * @return a reference to the node; null if the node is not in the tree
463   * 
464   * @see #add(Node)
465   * @see #getNode(String)
466   */
467  protected Node getNodeForNetworkLocation(Node node) {
468    return getNode(node.getNetworkLocation());
469  }
470  
471  /**
472   * Given a string representation of a rack, return its children
473   * @param loc a path-like string representation of a rack
474   * @return a newly allocated list with all the node's children
475   */
476  public List<Node> getDatanodesInRack(String loc) {
477    netlock.readLock().lock();
478    try {
479      loc = NodeBase.normalize(loc);
480      if (!NodeBase.ROOT.equals(loc)) {
481        loc = loc.substring(1);
482      }
483      InnerNode rack = (InnerNode) clusterMap.getLoc(loc);
484      if (rack == null) {
485        return null;
486      }
487      return new ArrayList<Node>(rack.getChildren());
488    } finally {
489      netlock.readLock().unlock();
490    }
491  }
492
493  /** Remove a node
494   * Update node counter and rack counter if necessary
495   * @param node node to be removed; can be null
496   */ 
497  public void remove(Node node) {
498    if (node==null) return;
499    if( node instanceof InnerNode ) {
500      throw new IllegalArgumentException(
501        "Not allow to remove an inner node: "+NodeBase.getPath(node));
502    }
503    LOG.info("Removing a node: "+NodeBase.getPath(node));
504    netlock.writeLock().lock();
505    try {
506      if (clusterMap.remove(node)) {
507        InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
508        if (rack == null) {
509          numOfRacks--;
510        }
511      }
512      if(LOG.isDebugEnabled()) {
513        LOG.debug("NetworkTopology became:\n" + this.toString());
514      }
515    } finally {
516      netlock.writeLock().unlock();
517    }
518  }
519
520  /** Check if the tree contains node <i>node</i>
521   * 
522   * @param node a node
523   * @return true if <i>node</i> is already in the tree; false otherwise
524   */
525  public boolean contains(Node node) {
526    if (node == null) return false;
527    netlock.readLock().lock();
528    try {
529      Node parent = node.getParent();
530      for (int level = node.getLevel(); parent != null && level > 0;
531           parent = parent.getParent(), level--) {
532        if (parent == clusterMap) {
533          return true;
534        }
535      }
536    } finally {
537      netlock.readLock().unlock();
538    }
539    return false; 
540  }
541    
542  /** Given a string representation of a node, return its reference
543   * 
544   * @param loc
545   *          a path-like string representation of a node
546   * @return a reference to the node; null if the node is not in the tree
547   */
548  public Node getNode(String loc) {
549    netlock.readLock().lock();
550    try {
551      loc = NodeBase.normalize(loc);
552      if (!NodeBase.ROOT.equals(loc))
553        loc = loc.substring(1);
554      return clusterMap.getLoc(loc);
555    } finally {
556      netlock.readLock().unlock();
557    }
558  }
559
560  /**
561   * @return true if this cluster has ever consisted of multiple racks, even if
562   *         it is not now a multi-rack cluster.
563   */
564  public boolean hasClusterEverBeenMultiRack() {
565    return clusterEverBeenMultiRack;
566  }
567
568  /** Given a string representation of a rack for a specific network
569   *  location
570   *
571   * To be overridden in subclasses for specific NetworkTopology 
572   * implementations, as alternative to overriding the full 
573   * {@link #getRack(String)} method.
574   * @param loc
575   *          a path-like string representation of a network location
576   * @return a rack string
577   */
578  public String getRack(String loc) {
579    return loc;
580  }
581  
582  /** @return the total number of racks */
583  public int getNumOfRacks() {
584    netlock.readLock().lock();
585    try {
586      return numOfRacks;
587    } finally {
588      netlock.readLock().unlock();
589    }
590  }
591
592  /** @return the total number of leaf nodes */
593  public int getNumOfLeaves() {
594    netlock.readLock().lock();
595    try {
596      return clusterMap.getNumOfLeaves();
597    } finally {
598      netlock.readLock().unlock();
599    }
600  }
601
602  /** Return the distance between two nodes
603   * It is assumed that the distance from one node to its parent is 1
604   * The distance between two nodes is calculated by summing up their distances
605   * to their closest common ancestor.
606   * @param node1 one node
607   * @param node2 another node
608   * @return the distance between node1 and node2 which is zero if they are the same
609   *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
610   */
611  public int getDistance(Node node1, Node node2) {
612    if (node1 == node2) {
613      return 0;
614    }
615    Node n1=node1, n2=node2;
616    int dis = 0;
617    netlock.readLock().lock();
618    try {
619      int level1=node1.getLevel(), level2=node2.getLevel();
620      while(n1!=null && level1>level2) {
621        n1 = n1.getParent();
622        level1--;
623        dis++;
624      }
625      while(n2!=null && level2>level1) {
626        n2 = n2.getParent();
627        level2--;
628        dis++;
629      }
630      while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
631        n1=n1.getParent();
632        n2=n2.getParent();
633        dis+=2;
634      }
635    } finally {
636      netlock.readLock().unlock();
637    }
638    if (n1==null) {
639      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
640      return Integer.MAX_VALUE;
641    }
642    if (n2==null) {
643      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
644      return Integer.MAX_VALUE;
645    }
646    return dis+2;
647  }
648
649  /** Check if two nodes are on the same rack
650   * @param node1 one node (can be null)
651   * @param node2 another node (can be null)
652   * @return true if node1 and node2 are on the same rack; false otherwise
653   * @exception IllegalArgumentException when either node1 or node2 is null, or
654   * node1 or node2 do not belong to the cluster
655   */
656  public boolean isOnSameRack( Node node1,  Node node2) {
657    if (node1 == null || node2 == null) {
658      return false;
659    }
660      
661    netlock.readLock().lock();
662    try {
663      return isSameParents(node1, node2);
664    } finally {
665      netlock.readLock().unlock();
666    }
667  }
668  
669  /**
670   * Check if network topology is aware of NodeGroup
671   */
672  public boolean isNodeGroupAware() {
673    return false;
674  }
675  
676  /** 
677   * Return false directly as not aware of NodeGroup, to be override in sub-class
678   */
679  public boolean isOnSameNodeGroup(Node node1, Node node2) {
680    return false;
681  }
682
683  /**
684   * Compare the parents of each node for equality
685   * 
686   * <p>To be overridden in subclasses for specific NetworkTopology 
687   * implementations, as alternative to overriding the full 
688   * {@link #isOnSameRack(Node, Node)} method.
689   * 
690   * @param node1 the first node to compare
691   * @param node2 the second node to compare
692   * @return true if their parents are equal, false otherwise
693   * 
694   * @see #isOnSameRack(Node, Node)
695   */
696  protected boolean isSameParents(Node node1, Node node2) {
697    return node1.getParent()==node2.getParent();
698  }
699
700  private static final Random r = new Random();
701
702  @VisibleForTesting
703  void setRandomSeed(long seed) {
704    r.setSeed(seed);
705  }
706
707  /** randomly choose one node from <i>scope</i>
708   * if scope starts with ~, choose one from the all nodes except for the
709   * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
710   * @param scope range of nodes from which a node will be chosen
711   * @return the chosen node
712   */
713  public Node chooseRandom(String scope) {
714    netlock.readLock().lock();
715    try {
716      if (scope.startsWith("~")) {
717        return chooseRandom(NodeBase.ROOT, scope.substring(1));
718      } else {
719        return chooseRandom(scope, null);
720      }
721    } finally {
722      netlock.readLock().unlock();
723    }
724  }
725
726  private Node chooseRandom(String scope, String excludedScope){
727    if (excludedScope != null) {
728      if (scope.startsWith(excludedScope)) {
729        return null;
730      }
731      if (!excludedScope.startsWith(scope)) {
732        excludedScope = null;
733      }
734    }
735    Node node = getNode(scope);
736    if (!(node instanceof InnerNode)) {
737      return node;
738    }
739    InnerNode innerNode = (InnerNode)node;
740    int numOfDatanodes = innerNode.getNumOfLeaves();
741    if (excludedScope == null) {
742      node = null;
743    } else {
744      node = getNode(excludedScope);
745      if (!(node instanceof InnerNode)) {
746        numOfDatanodes -= 1;
747      } else {
748        numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
749      }
750    }
751    if (numOfDatanodes == 0) {
752      throw new InvalidTopologyException(
753          "Failed to find datanode (scope=\"" + String.valueOf(scope) +
754          "\" excludedScope=\"" + String.valueOf(excludedScope) + "\").");
755    }
756    int leaveIndex = r.nextInt(numOfDatanodes);
757    return innerNode.getLeaf(leaveIndex, node);
758  }
759
760  /** return leaves in <i>scope</i>
761   * @param scope a path string
762   * @return leaves nodes under specific scope
763   */
764  public List<Node> getLeaves(String scope) {
765    Node node = getNode(scope);
766    List<Node> leafNodes = new ArrayList<Node>();
767    if (!(node instanceof InnerNode)) {
768      leafNodes.add(node);
769    } else {
770      InnerNode innerNode = (InnerNode) node;
771      for (int i=0;i<innerNode.getNumOfLeaves();i++) {
772        leafNodes.add(innerNode.getLeaf(i, null));
773      }
774    }
775    return leafNodes;
776  }
777
778  /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
779   * if scope starts with ~, return the number of nodes that are not
780   * in <i>scope</i> and <i>excludedNodes</i>; 
781   * @param scope a path string that may start with ~
782   * @param excludedNodes a list of nodes
783   * @return number of available nodes
784   */
785  public int countNumOfAvailableNodes(String scope,
786                                      Collection<Node> excludedNodes) {
787    boolean isExcluded=false;
788    if (scope.startsWith("~")) {
789      isExcluded=true;
790      scope=scope.substring(1);
791    }
792    scope = NodeBase.normalize(scope);
793    int excludedCountInScope = 0; // the number of nodes in both scope & excludedNodes
794    int excludedCountOffScope = 0; // the number of nodes outside scope & excludedNodes
795    netlock.readLock().lock();
796    try {
797      for (Node node : excludedNodes) {
798        node = getNode(NodeBase.getPath(node));
799        if (node == null) {
800          continue;
801        }
802        if ((NodeBase.getPath(node) + NodeBase.PATH_SEPARATOR_STR)
803            .startsWith(scope + NodeBase.PATH_SEPARATOR_STR)) {
804          excludedCountInScope++;
805        } else {
806          excludedCountOffScope++;
807        }
808      }
809      Node n = getNode(scope);
810      int scopeNodeCount = 0;
811      if (n != null) {
812        scopeNodeCount++;
813      }
814      if (n instanceof InnerNode) {
815        scopeNodeCount=((InnerNode)n).getNumOfLeaves();
816      }
817      if (isExcluded) {
818        return clusterMap.getNumOfLeaves() - scopeNodeCount
819            - excludedCountOffScope;
820      } else {
821        return scopeNodeCount - excludedCountInScope;
822      }
823    } finally {
824      netlock.readLock().unlock();
825    }
826  }
827
828  /** convert a network tree to a string */
829  @Override
830  public String toString() {
831    // print the number of racks
832    StringBuilder tree = new StringBuilder();
833    tree.append("Number of racks: ");
834    tree.append(numOfRacks);
835    tree.append("\n");
836    // print the number of leaves
837    int numOfLeaves = getNumOfLeaves();
838    tree.append("Expected number of leaves:");
839    tree.append(numOfLeaves);
840    tree.append("\n");
841    // print nodes
842    for(int i=0; i<numOfLeaves; i++) {
843      tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
844      tree.append("\n");
845    }
846    return tree.toString();
847  }
848  
849  /**
850   * Divide networklocation string into two parts by last separator, and get 
851   * the first part here.
852   * 
853   * @param networkLocation
854   * @return
855   */
856  public static String getFirstHalf(String networkLocation) {
857    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
858    return networkLocation.substring(0, index);
859  }
860
861  /**
862   * Divide networklocation string into two parts by last separator, and get 
863   * the second part here.
864   * 
865   * @param networkLocation
866   * @return
867   */
868  public static String getLastHalf(String networkLocation) {
869    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
870    return networkLocation.substring(index);
871  }
872
873  /**
874   * Returns an integer weight which specifies how far away {node} is away from
875   * {reader}. A lower value signifies that a node is closer.
876   * 
877   * @param reader Node where data will be read
878   * @param node Replica of data
879   * @return weight
880   */
881  protected int getWeight(Node reader, Node node) {
882    // 0 is local, 1 is same rack, 2 is off rack
883    // Start off by initializing to off rack
884    int weight = 2;
885    if (reader != null) {
886      if (reader.equals(node)) {
887        weight = 0;
888      } else if (isOnSameRack(reader, node)) {
889        weight = 1;
890      }
891    }
892    return weight;
893  }
894
895  /**
896   * Sort nodes array by network distance to <i>reader</i>.
897   * <p/>
898   * In a three-level topology, a node can be either local, on the same rack,
899   * or on a different rack from the reader. Sorting the nodes based on network
900   * distance from the reader reduces network traffic and improves
901   * performance.
902   * <p/>
903   * As an additional twist, we also randomize the nodes at each network
904   * distance. This helps with load balancing when there is data skew.
905   *
906   * @param reader    Node where data will be read
907   * @param nodes     Available replicas with the requested data
908   * @param activeLen Number of active nodes at the front of the array
909   */
910  public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
911    /** Sort weights for the nodes array */
912    int[] weights = new int[activeLen];
913    for (int i=0; i<activeLen; i++) {
914      weights[i] = getWeight(reader, nodes[i]);
915    }
916    // Add weight/node pairs to a TreeMap to sort
917    TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
918    for (int i=0; i<activeLen; i++) {
919      int weight = weights[i];
920      Node node = nodes[i];
921      List<Node> list = tree.get(weight);
922      if (list == null) {
923        list = Lists.newArrayListWithExpectedSize(1);
924        tree.put(weight, list);
925      }
926      list.add(node);
927    }
928
929    int idx = 0;
930    for (List<Node> list: tree.values()) {
931      if (list != null) {
932        Collections.shuffle(list, r);
933        for (Node n: list) {
934          nodes[idx] = n;
935          idx++;
936        }
937      }
938    }
939    Preconditions.checkState(idx == activeLen,
940        "Sorted the wrong number of nodes!");
941  }
942}