001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.net;
019
020import java.util.ArrayList;
021import java.util.List;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.List;
025import java.util.Random;
026import java.util.TreeMap;
027import java.util.concurrent.locks.ReadWriteLock;
028import java.util.concurrent.locks.ReentrantReadWriteLock;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.classification.InterfaceStability;
034import org.apache.hadoop.conf.Configuration;
035import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
036import org.apache.hadoop.util.ReflectionUtils;
037
038import com.google.common.base.Preconditions;
039import com.google.common.collect.Lists;
040
/**
 * This class represents a cluster of computers with a tree-shaped,
 * hierarchical network topology.
 * For example, a cluster may consist of many data centers filled
 * with racks of computers.
 * In a network topology, leaves represent data nodes (computers) and inner
 * nodes represent switches/routers that manage traffic in/out of data centers
 * or racks.
 */
050@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
051@InterfaceStability.Unstable
052public class NetworkTopology {
053  public final static String DEFAULT_RACK = "/default-rack";
054  public final static int DEFAULT_HOST_LEVEL = 2;
055  public static final Log LOG = 
056    LogFactory.getLog(NetworkTopology.class);
057    
058  public static class InvalidTopologyException extends RuntimeException {
059    private static final long serialVersionUID = 1L;
060    public InvalidTopologyException(String msg) {
061      super(msg);
062    }
063  }
064  
065  /**
066   * Get an instance of NetworkTopology based on the value of the configuration
067   * parameter net.topology.impl.
068   * 
069   * @param conf the configuration to be used
070   * @return an instance of NetworkTopology
071   */
072  public static NetworkTopology getInstance(Configuration conf){
073    return ReflectionUtils.newInstance(
074        conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
075        NetworkTopology.class, NetworkTopology.class), conf);
076  }
077
078  /** InnerNode represents a switch/router of a data center or rack.
079   * Different from a leaf node, it has non-null children.
080   */
081  static class InnerNode extends NodeBase {
082    protected List<Node> children=new ArrayList<Node>();
083    private int numOfLeaves;
084        
085    /** Construct an InnerNode from a path-like string */
086    InnerNode(String path) {
087      super(path);
088    }
089        
090    /** Construct an InnerNode from its name and its network location */
091    InnerNode(String name, String location) {
092      super(name, location);
093    }
094        
095    /** Construct an InnerNode
096     * from its name, its network location, its parent, and its level */
097    InnerNode(String name, String location, InnerNode parent, int level) {
098      super(name, location, parent, level);
099    }
100        
101    /** @return its children */
102    List<Node> getChildren() {return children;}
103        
104    /** @return the number of children this node has */
105    int getNumOfChildren() {
106      return children.size();
107    }
108        
109    /** Judge if this node represents a rack 
110     * @return true if it has no child or its children are not InnerNodes
111     */ 
112    boolean isRack() {
113      if (children.isEmpty()) {
114        return true;
115      }
116            
117      Node firstChild = children.get(0);
118      if (firstChild instanceof InnerNode) {
119        return false;
120      }
121            
122      return true;
123    }
124        
125    /** Judge if this node is an ancestor of node <i>n</i>
126     * 
127     * @param n a node
128     * @return true if this node is an ancestor of <i>n</i>
129     */
130    boolean isAncestor(Node n) {
131      return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
132        (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
133        startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
134    }
135        
136    /** Judge if this node is the parent of node <i>n</i>
137     * 
138     * @param n a node
139     * @return true if this node is the parent of <i>n</i>
140     */
141    boolean isParent(Node n) {
142      return n.getNetworkLocation().equals(getPath(this));
143    }
144        
145    /* Return a child name of this node who is an ancestor of node <i>n</i> */
146    private String getNextAncestorName(Node n) {
147      if (!isAncestor(n)) {
148        throw new IllegalArgumentException(
149                                           this + "is not an ancestor of " + n);
150      }
151      String name = n.getNetworkLocation().substring(getPath(this).length());
152      if (name.charAt(0) == PATH_SEPARATOR) {
153        name = name.substring(1);
154      }
155      int index=name.indexOf(PATH_SEPARATOR);
156      if (index !=-1)
157        name = name.substring(0, index);
158      return name;
159    }
160        
161    /** Add node <i>n</i> to the subtree of this node 
162     * @param n node to be added
163     * @return true if the node is added; false otherwise
164     */
165    boolean add(Node n) {
166      if (!isAncestor(n))
167        throw new IllegalArgumentException(n.getName()+", which is located at "
168                +n.getNetworkLocation()+", is not a decendent of "
169                +getPath(this));
170      if (isParent(n)) {
171        // this node is the parent of n; add n directly
172        n.setParent(this);
173        n.setLevel(this.level+1);
174        for(int i=0; i<children.size(); i++) {
175          if (children.get(i).getName().equals(n.getName())) {
176            children.set(i, n);
177            return false;
178          }
179        }
180        children.add(n);
181        numOfLeaves++;
182        return true;
183      } else {
184        // find the next ancestor node
185        String parentName = getNextAncestorName(n);
186        InnerNode parentNode = null;
187        for(int i=0; i<children.size(); i++) {
188          if (children.get(i).getName().equals(parentName)) {
189            parentNode = (InnerNode)children.get(i);
190            break;
191          }
192        }
193        if (parentNode == null) {
194          // create a new InnerNode
195          parentNode = createParentNode(parentName);
196          children.add(parentNode);
197        }
198        // add n to the subtree of the next ancestor node
199        if (parentNode.add(n)) {
200          numOfLeaves++;
201          return true;
202        } else {
203          return false;
204        }
205      }
206    }
207
208    /**
209     * Creates a parent node to be added to the list of children.  
210     * Creates a node using the InnerNode four argument constructor specifying 
211     * the name, location, parent, and level of this node.
212     * 
213     * <p>To be overridden in subclasses for specific InnerNode implementations,
214     * as alternative to overriding the full {@link #add(Node)} method.
215     * 
216     * @param parentName The name of the parent node
217     * @return A new inner node
218     * @see InnerNode#InnerNode(String, String, InnerNode, int)
219     */
220    protected InnerNode createParentNode(String parentName) {
221      return new InnerNode(parentName, getPath(this), this, this.getLevel()+1);
222    }
223
224    /** Remove node <i>n</i> from the subtree of this node
225     * @param n node to be deleted 
226     * @return true if the node is deleted; false otherwise
227     */
228    boolean remove(Node n) {
229      String parent = n.getNetworkLocation();
230      String currentPath = getPath(this);
231      if (!isAncestor(n))
232        throw new IllegalArgumentException(n.getName()
233                                           +", which is located at "
234                                           +parent+", is not a descendent of "+currentPath);
235      if (isParent(n)) {
236        // this node is the parent of n; remove n directly
237        for(int i=0; i<children.size(); i++) {
238          if (children.get(i).getName().equals(n.getName())) {
239            children.remove(i);
240            numOfLeaves--;
241            n.setParent(null);
242            return true;
243          }
244        }
245        return false;
246      } else {
247        // find the next ancestor node: the parent node
248        String parentName = getNextAncestorName(n);
249        InnerNode parentNode = null;
250        int i;
251        for(i=0; i<children.size(); i++) {
252          if (children.get(i).getName().equals(parentName)) {
253            parentNode = (InnerNode)children.get(i);
254            break;
255          }
256        }
257        if (parentNode==null) {
258          return false;
259        }
260        // remove n from the parent node
261        boolean isRemoved = parentNode.remove(n);
262        // if the parent node has no children, remove the parent node too
263        if (isRemoved) {
264          if (parentNode.getNumOfChildren() == 0) {
265            children.remove(i);
266          }
267          numOfLeaves--;
268        }
269        return isRemoved;
270      }
271    } // end of remove
272        
273    /** Given a node's string representation, return a reference to the node
274     * @param loc string location of the form /rack/node
275     * @return null if the node is not found or the childnode is there but
276     * not an instance of {@link InnerNode}
277     */
278    private Node getLoc(String loc) {
279      if (loc == null || loc.length() == 0) return this;
280            
281      String[] path = loc.split(PATH_SEPARATOR_STR, 2);
282      Node childnode = null;
283      for(int i=0; i<children.size(); i++) {
284        if (children.get(i).getName().equals(path[0])) {
285          childnode = children.get(i);
286        }
287      }
288      if (childnode == null) return null; // non-existing node
289      if (path.length == 1) return childnode;
290      if (childnode instanceof InnerNode) {
291        return ((InnerNode)childnode).getLoc(path[1]);
292      } else {
293        return null;
294      }
295    }
296        
297    /** get <i>leafIndex</i> leaf of this subtree 
298     * if it is not in the <i>excludedNode</i>
299     *
300     * @param leafIndex an indexed leaf of the node
301     * @param excludedNode an excluded node (can be null)
302     * @return
303     */
304    Node getLeaf(int leafIndex, Node excludedNode) {
305      int count=0;
306      // check if the excluded node a leaf
307      boolean isLeaf =
308        excludedNode == null || !(excludedNode instanceof InnerNode);
309      // calculate the total number of excluded leaf nodes
310      int numOfExcludedLeaves =
311        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
312      if (isLeafParent()) { // children are leaves
313        if (isLeaf) { // excluded node is a leaf node
314          int excludedIndex = children.indexOf(excludedNode);
315          if (excludedIndex != -1 && leafIndex >= 0) {
316            // excluded node is one of the children so adjust the leaf index
317            leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
318          }
319        }
320        // range check
321        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
322          return null;
323        }
324        return children.get(leafIndex);
325      } else {
326        for(int i=0; i<children.size(); i++) {
327          InnerNode child = (InnerNode)children.get(i);
328          if (excludedNode == null || excludedNode != child) {
329            // not the excludedNode
330            int numOfLeaves = child.getNumOfLeaves();
331            if (excludedNode != null && child.isAncestor(excludedNode)) {
332              numOfLeaves -= numOfExcludedLeaves;
333            }
334            if (count+numOfLeaves > leafIndex) {
335              // the leaf is in the child subtree
336              return child.getLeaf(leafIndex-count, excludedNode);
337            } else {
338              // go to the next child
339              count = count+numOfLeaves;
340            }
341          } else { // it is the excluededNode
342            // skip it and set the excludedNode to be null
343            excludedNode = null;
344          }
345        }
346        return null;
347      }
348    }
349    
350    protected boolean isLeafParent() {
351      return isRack();
352    }
353
354    /**
355      * Determine if children a leaves, default implementation calls {@link #isRack()}
356      * <p>To be overridden in subclasses for specific InnerNode implementations,
357      * as alternative to overriding the full {@link #getLeaf(int, Node)} method.
358      * 
359      * @return true if children are leaves, false otherwise
360      */
361    protected boolean areChildrenLeaves() {
362      return isRack();
363    }
364
365    /**
366     * Get number of leaves.
367     */
368    int getNumOfLeaves() {
369      return numOfLeaves;
370    }
371  } // end of InnerNode
372
373  /**
374   * the root cluster map
375   */
376  InnerNode clusterMap;
377  /** Depth of all leaf nodes */
378  private int depthOfAllLeaves = -1;
379  /** rack counter */
380  protected int numOfRacks = 0;
381  /** the lock used to manage access */
382  protected ReadWriteLock netlock = new ReentrantReadWriteLock();
383
384  public NetworkTopology() {
385    clusterMap = new InnerNode(InnerNode.ROOT);
386  }
387
388  /** Add a leaf node
389   * Update node counter & rack counter if necessary
390   * @param node node to be added; can be null
391   * @exception IllegalArgumentException if add a node to a leave 
392                                         or node to be added is not a leaf
393   */
394  public void add(Node node) {
395    if (node==null) return;
396    String oldTopoStr = this.toString();
397    if( node instanceof InnerNode ) {
398      throw new IllegalArgumentException(
399        "Not allow to add an inner node: "+NodeBase.getPath(node));
400    }
401    int newDepth = NodeBase.locationToDepth(node.getNetworkLocation()) + 1;
402    netlock.writeLock().lock();
403    try {
404      if ((depthOfAllLeaves != -1) && (depthOfAllLeaves != newDepth)) {
405        LOG.error("Error: can't add leaf node " + NodeBase.getPath(node) +
406            " at depth " + newDepth + " to topology:\n" + oldTopoStr);
407        throw new InvalidTopologyException("Failed to add " + NodeBase.getPath(node) +
408            ": You cannot have a rack and a non-rack node at the same " +
409            "level of the network topology.");
410      }
411      Node rack = getNodeForNetworkLocation(node);
412      if (rack != null && !(rack instanceof InnerNode)) {
413        throw new IllegalArgumentException("Unexpected data node " 
414                                           + node.toString() 
415                                           + " at an illegal network location");
416      }
417      if (clusterMap.add(node)) {
418        LOG.info("Adding a new node: "+NodeBase.getPath(node));
419        if (rack == null) {
420          numOfRacks++;
421        }
422        if (!(node instanceof InnerNode)) {
423          if (depthOfAllLeaves == -1) {
424            depthOfAllLeaves = node.getLevel();
425          }
426        }
427      }
428      if(LOG.isDebugEnabled()) {
429        LOG.debug("NetworkTopology became:\n" + this.toString());
430      }
431    } finally {
432      netlock.writeLock().unlock();
433    }
434  }
435  
436  /**
437   * Return a reference to the node given its string representation.
438   * Default implementation delegates to {@link #getNode(String)}.
439   * 
440   * <p>To be overridden in subclasses for specific NetworkTopology 
441   * implementations, as alternative to overriding the full {@link #add(Node)}
442   *  method.
443   * 
444   * @param node The string representation of this node's network location is
445   * used to retrieve a Node object. 
446   * @return a reference to the node; null if the node is not in the tree
447   * 
448   * @see #add(Node)
449   * @see #getNode(String)
450   */
451  protected Node getNodeForNetworkLocation(Node node) {
452    return getNode(node.getNetworkLocation());
453  }
454  
455  /**
456   * Given a string representation of a rack, return its children
457   * @param loc a path-like string representation of a rack
458   * @return a newly allocated list with all the node's children
459   */
460  public List<Node> getDatanodesInRack(String loc) {
461    netlock.readLock().lock();
462    try {
463      loc = NodeBase.normalize(loc);
464      if (!NodeBase.ROOT.equals(loc)) {
465        loc = loc.substring(1);
466      }
467      InnerNode rack = (InnerNode) clusterMap.getLoc(loc);
468      if (rack == null) {
469        return null;
470      }
471      return new ArrayList<Node>(rack.getChildren());
472    } finally {
473      netlock.readLock().unlock();
474    }
475  }
476
477  /** Remove a node
478   * Update node counter and rack counter if necessary
479   * @param node node to be removed; can be null
480   */ 
481  public void remove(Node node) {
482    if (node==null) return;
483    if( node instanceof InnerNode ) {
484      throw new IllegalArgumentException(
485        "Not allow to remove an inner node: "+NodeBase.getPath(node));
486    }
487    LOG.info("Removing a node: "+NodeBase.getPath(node));
488    netlock.writeLock().lock();
489    try {
490      if (clusterMap.remove(node)) {
491        InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
492        if (rack == null) {
493          numOfRacks--;
494        }
495      }
496      if(LOG.isDebugEnabled()) {
497        LOG.debug("NetworkTopology became:\n" + this.toString());
498      }
499    } finally {
500      netlock.writeLock().unlock();
501    }
502  }
503
504  /** Check if the tree contains node <i>node</i>
505   * 
506   * @param node a node
507   * @return true if <i>node</i> is already in the tree; false otherwise
508   */
509  public boolean contains(Node node) {
510    if (node == null) return false;
511    netlock.readLock().lock();
512    try {
513      Node parent = node.getParent();
514      for (int level = node.getLevel(); parent != null && level > 0;
515           parent = parent.getParent(), level--) {
516        if (parent == clusterMap) {
517          return true;
518        }
519      }
520    } finally {
521      netlock.readLock().unlock();
522    }
523    return false; 
524  }
525    
526  /** Given a string representation of a node, return its reference
527   * 
528   * @param loc
529   *          a path-like string representation of a node
530   * @return a reference to the node; null if the node is not in the tree
531   */
532  public Node getNode(String loc) {
533    netlock.readLock().lock();
534    try {
535      loc = NodeBase.normalize(loc);
536      if (!NodeBase.ROOT.equals(loc))
537        loc = loc.substring(1);
538      return clusterMap.getLoc(loc);
539    } finally {
540      netlock.readLock().unlock();
541    }
542  }
543  
544  /** Given a string representation of a rack for a specific network
545   *  location
546   * 
547   * To be overridden in subclasses for specific NetworkTopology 
548   * implementations, as alternative to overriding the full 
549   * {@link #getRack(String)} method.
550   * @param loc
551   *          a path-like string representation of a network location
552   * @return a rack string
553   */
554  public String getRack(String loc) {
555    return loc;
556  }
557  
558  /** @return the total number of racks */
559  public int getNumOfRacks() {
560    netlock.readLock().lock();
561    try {
562      return numOfRacks;
563    } finally {
564      netlock.readLock().unlock();
565    }
566  }
567
568  /** @return the total number of leaf nodes */
569  public int getNumOfLeaves() {
570    netlock.readLock().lock();
571    try {
572      return clusterMap.getNumOfLeaves();
573    } finally {
574      netlock.readLock().unlock();
575    }
576  }
577
578  /** Return the distance between two nodes
579   * It is assumed that the distance from one node to its parent is 1
580   * The distance between two nodes is calculated by summing up their distances
581   * to their closest common ancestor.
582   * @param node1 one node
583   * @param node2 another node
584   * @return the distance between node1 and node2 which is zero if they are the same
585   *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
586   */
587  public int getDistance(Node node1, Node node2) {
588    if (node1 == node2) {
589      return 0;
590    }
591    Node n1=node1, n2=node2;
592    int dis = 0;
593    netlock.readLock().lock();
594    try {
595      int level1=node1.getLevel(), level2=node2.getLevel();
596      while(n1!=null && level1>level2) {
597        n1 = n1.getParent();
598        level1--;
599        dis++;
600      }
601      while(n2!=null && level2>level1) {
602        n2 = n2.getParent();
603        level2--;
604        dis++;
605      }
606      while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
607        n1=n1.getParent();
608        n2=n2.getParent();
609        dis+=2;
610      }
611    } finally {
612      netlock.readLock().unlock();
613    }
614    if (n1==null) {
615      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
616      return Integer.MAX_VALUE;
617    }
618    if (n2==null) {
619      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
620      return Integer.MAX_VALUE;
621    }
622    return dis+2;
623  }
624
625  /** Check if two nodes are on the same rack
626   * @param node1 one node (can be null)
627   * @param node2 another node (can be null)
628   * @return true if node1 and node2 are on the same rack; false otherwise
629   * @exception IllegalArgumentException when either node1 or node2 is null, or
630   * node1 or node2 do not belong to the cluster
631   */
632  public boolean isOnSameRack( Node node1,  Node node2) {
633    if (node1 == null || node2 == null) {
634      return false;
635    }
636      
637    netlock.readLock().lock();
638    try {
639      return isSameParents(node1, node2);
640    } finally {
641      netlock.readLock().unlock();
642    }
643  }
644  
645  /**
646   * Check if network topology is aware of NodeGroup
647   */
648  public boolean isNodeGroupAware() {
649    return false;
650  }
651  
652  /** 
653   * Return false directly as not aware of NodeGroup, to be override in sub-class
654   */
655  public boolean isOnSameNodeGroup(Node node1, Node node2) {
656    return false;
657  }
658
659  /**
660   * Compare the parents of each node for equality
661   * 
662   * <p>To be overridden in subclasses for specific NetworkTopology 
663   * implementations, as alternative to overriding the full 
664   * {@link #isOnSameRack(Node, Node)} method.
665   * 
666   * @param node1 the first node to compare
667   * @param node2 the second node to compare
668   * @return true if their parents are equal, false otherwise
669   * 
670   * @see #isOnSameRack(Node, Node)
671   */
672  protected boolean isSameParents(Node node1, Node node2) {
673    return node1.getParent()==node2.getParent();
674  }
675
676  private static final ThreadLocal<Random> r = new ThreadLocal<Random>();
677
678  /**
679   * Getter for thread-local Random, which provides better performance than
680   * a shared Random (even though Random is thread-safe).
681   *
682   * @return Thread-local Random.
683   */
684  protected Random getRandom() {
685    Random rand = r.get();
686    if (rand == null) {
687      rand = new Random();
688      r.set(rand);
689    }
690    return rand;
691  }
692
693  /** randomly choose one node from <i>scope</i>
694   * if scope starts with ~, choose one from the all nodes except for the
695   * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
696   * @param scope range of nodes from which a node will be chosen
697   * @return the chosen node
698   */
699  public Node chooseRandom(String scope) {
700    netlock.readLock().lock();
701    try {
702      if (scope.startsWith("~")) {
703        return chooseRandom(NodeBase.ROOT, scope.substring(1));
704      } else {
705        return chooseRandom(scope, null);
706      }
707    } finally {
708      netlock.readLock().unlock();
709    }
710  }
711
712  private Node chooseRandom(String scope, String excludedScope){
713    if (excludedScope != null) {
714      if (scope.startsWith(excludedScope)) {
715        return null;
716      }
717      if (!excludedScope.startsWith(scope)) {
718        excludedScope = null;
719      }
720    }
721    Node node = getNode(scope);
722    if (!(node instanceof InnerNode)) {
723      return node;
724    }
725    InnerNode innerNode = (InnerNode)node;
726    int numOfDatanodes = innerNode.getNumOfLeaves();
727    if (excludedScope == null) {
728      node = null;
729    } else {
730      node = getNode(excludedScope);
731      if (!(node instanceof InnerNode)) {
732        numOfDatanodes -= 1;
733      } else {
734        numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
735      }
736    }
737    if (numOfDatanodes == 0) {
738      throw new InvalidTopologyException(
739          "Failed to find datanode (scope=\"" + String.valueOf(scope) +
740          "\" excludedScope=\"" + String.valueOf(excludedScope) + "\").");
741    }
742    int leaveIndex = getRandom().nextInt(numOfDatanodes);
743    return innerNode.getLeaf(leaveIndex, node);
744  }
745
746  /** return leaves in <i>scope</i>
747   * @param scope a path string
748   * @return leaves nodes under specific scope
749   */
750  public List<Node> getLeaves(String scope) {
751    Node node = getNode(scope);
752    List<Node> leafNodes = new ArrayList<Node>();
753    if (!(node instanceof InnerNode)) {
754      leafNodes.add(node);
755    } else {
756      InnerNode innerNode = (InnerNode) node;
757      for (int i=0;i<innerNode.getNumOfLeaves();i++) {
758        leafNodes.add(innerNode.getLeaf(i, null));
759      }
760    }
761    return leafNodes;
762  }
763
764  /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
765   * if scope starts with ~, return the number of nodes that are not
766   * in <i>scope</i> and <i>excludedNodes</i>; 
767   * @param scope a path string that may start with ~
768   * @param excludedNodes a list of nodes
769   * @return number of available nodes
770   */
771  public int countNumOfAvailableNodes(String scope,
772                                      Collection<Node> excludedNodes) {
773    boolean isExcluded=false;
774    if (scope.startsWith("~")) {
775      isExcluded=true;
776      scope=scope.substring(1);
777    }
778    scope = NodeBase.normalize(scope);
779    int count=0; // the number of nodes in both scope & excludedNodes
780    netlock.readLock().lock();
781    try {
782      for(Node node:excludedNodes) {
783        if ((NodeBase.getPath(node)+NodeBase.PATH_SEPARATOR_STR).
784            startsWith(scope+NodeBase.PATH_SEPARATOR_STR)) {
785          count++;
786        }
787      }
788      Node n=getNode(scope);
789      int scopeNodeCount=1;
790      if (n instanceof InnerNode) {
791        scopeNodeCount=((InnerNode)n).getNumOfLeaves();
792      }
793      if (isExcluded) {
794        return clusterMap.getNumOfLeaves()-
795          scopeNodeCount-excludedNodes.size()+count;
796      } else {
797        return scopeNodeCount-count;
798      }
799    } finally {
800      netlock.readLock().unlock();
801    }
802  }
803
804  /** convert a network tree to a string */
805  @Override
806  public String toString() {
807    // print the number of racks
808    StringBuilder tree = new StringBuilder();
809    tree.append("Number of racks: ");
810    tree.append(numOfRacks);
811    tree.append("\n");
812    // print the number of leaves
813    int numOfLeaves = getNumOfLeaves();
814    tree.append("Expected number of leaves:");
815    tree.append(numOfLeaves);
816    tree.append("\n");
817    // print nodes
818    for(int i=0; i<numOfLeaves; i++) {
819      tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
820      tree.append("\n");
821    }
822    return tree.toString();
823  }
824  
825  /**
826   * Divide networklocation string into two parts by last separator, and get 
827   * the first part here.
828   * 
829   * @param networkLocation
830   * @return
831   */
832  public static String getFirstHalf(String networkLocation) {
833    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
834    return networkLocation.substring(0, index);
835  }
836
837  /**
838   * Divide networklocation string into two parts by last separator, and get 
839   * the second part here.
840   * 
841   * @param networkLocation
842   * @return
843   */
844  public static String getLastHalf(String networkLocation) {
845    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
846    return networkLocation.substring(index);
847  }
848
849  /**
850   * Returns an integer weight which specifies how far away {node} is away from
851   * {reader}. A lower value signifies that a node is closer.
852   * 
853   * @param reader Node where data will be read
854   * @param node Replica of data
855   * @return weight
856   */
857  protected int getWeight(Node reader, Node node) {
858    // 0 is local, 1 is same rack, 2 is off rack
859    // Start off by initializing to off rack
860    int weight = 2;
861    if (reader != null) {
862      if (reader == node) {
863        weight = 0;
864      } else if (isOnSameRack(reader, node)) {
865        weight = 1;
866      }
867    }
868    return weight;
869  }
870
871  /**
872   * Sort nodes array by network distance to <i>reader</i>.
873   * <p/>
874   * In a three-level topology, a node can be either local, on the same rack, or
875   * on a different rack from the reader. Sorting the nodes based on network
876   * distance from the reader reduces network traffic and improves performance.
877   * <p/>
878   * As an additional twist, we also randomize the nodes at each network
879   * distance using the provided random seed. This helps with load balancing
880   * when there is data skew.
881   * 
882   * @param reader Node where data will be read
883   * @param nodes Available replicas with the requested data
884   * @param seed Used to seed the pseudo-random generator that randomizes the
885   *          set of nodes at each network distance.
886   */
887  public void sortByDistance(Node reader, Node[] nodes,
888      int activeLen, long seed) {
889    /** Sort weights for the nodes array */
890    int[] weights = new int[activeLen];
891    for (int i=0; i<activeLen; i++) {
892      weights[i] = getWeight(reader, nodes[i]);
893    }
894    // Add weight/node pairs to a TreeMap to sort
895    TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
896    for (int i=0; i<activeLen; i++) {
897      int weight = weights[i];
898      Node node = nodes[i];
899      List<Node> list = tree.get(weight);
900      if (list == null) {
901        list = Lists.newArrayListWithExpectedSize(1);
902        tree.put(weight, list);
903      }
904      list.add(node);
905    }
906
907    // Seed is normally the block id
908    // This means we use the same pseudo-random order for each block, for
909    // potentially better page cache usage.
910    Random rand = getRandom();
911    rand.setSeed(seed);
912    int idx = 0;
913    for (List<Node> list: tree.values()) {
914      if (list != null) {
915        Collections.shuffle(list, rand);
916        for (Node n: list) {
917          nodes[idx] = n;
918          idx++;
919        }
920      }
921    }
922    Preconditions.checkState(idx == activeLen,
923        "Sorted the wrong number of nodes!");
924  }
925}