001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.net;
019
020import java.util.ArrayList;
021import java.util.Collection;
022import java.util.Random;
023import java.util.concurrent.locks.ReadWriteLock;
024import java.util.concurrent.locks.ReentrantReadWriteLock;
025
026import org.apache.commons.logging.Log;
027import org.apache.commons.logging.LogFactory;
028import org.apache.hadoop.classification.InterfaceAudience;
029import org.apache.hadoop.classification.InterfaceStability;
030
031/** The class represents a cluster of computer with a tree hierarchical
032 * network topology.
033 * For example, a cluster may be consists of many data centers filled 
034 * with racks of computers.
035 * In a network topology, leaves represent data nodes (computers) and inner
036 * nodes represent switches/routers that manage traffic in/out of data centers
037 * or racks.  
038 * 
039 */
040@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
041@InterfaceStability.Unstable
042public class NetworkTopology {
043  public final static String DEFAULT_RACK = "/default-rack";
044  public final static int DEFAULT_HOST_LEVEL = 2;
045  public static final Log LOG = 
046    LogFactory.getLog(NetworkTopology.class);
047    
048  public static class InvalidTopologyException extends RuntimeException {
049    private static final long serialVersionUID = 1L;
050    public InvalidTopologyException(String msg) {
051      super(msg);
052    }
053  }
054
055  /** InnerNode represents a switch/router of a data center or rack.
056   * Different from a leaf node, it has non-null children.
057   */
058  private class InnerNode extends NodeBase {
059    private ArrayList<Node> children=new ArrayList<Node>();
060    private int numOfLeaves;
061        
062    /** Construct an InnerNode from a path-like string */
063    InnerNode(String path) {
064      super(path);
065    }
066        
067    /** Construct an InnerNode from its name and its network location */
068    InnerNode(String name, String location) {
069      super(name, location);
070    }
071        
072    /** Construct an InnerNode
073     * from its name, its network location, its parent, and its level */
074    InnerNode(String name, String location, InnerNode parent, int level) {
075      super(name, location, parent, level);
076    }
077        
078    /** @return its children */
079    Collection<Node> getChildren() {return children;}
080        
081    /** @return the number of children this node has */
082    int getNumOfChildren() {
083      return children.size();
084    }
085        
086    /** Judge if this node represents a rack 
087     * @return true if it has no child or its children are not InnerNodes
088     */ 
089    boolean isRack() {
090      if (children.isEmpty()) {
091        return true;
092      }
093            
094      Node firstChild = children.get(0);
095      if (firstChild instanceof InnerNode) {
096        return false;
097      }
098            
099      return true;
100    }
101        
102    /** Judge if this node is an ancestor of node <i>n</i>
103     * 
104     * @param n a node
105     * @return true if this node is an ancestor of <i>n</i>
106     */
107    boolean isAncestor(Node n) {
108      return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
109        (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
110        startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
111    }
112        
113    /** Judge if this node is the parent of node <i>n</i>
114     * 
115     * @param n a node
116     * @return true if this node is the parent of <i>n</i>
117     */
118    boolean isParent(Node n) {
119      return n.getNetworkLocation().equals(getPath(this));
120    }
121        
122    /* Return a child name of this node who is an ancestor of node <i>n</i> */
123    private String getNextAncestorName(Node n) {
124      if (!isAncestor(n)) {
125        throw new IllegalArgumentException(
126                                           this + "is not an ancestor of " + n);
127      }
128      String name = n.getNetworkLocation().substring(getPath(this).length());
129      if (name.charAt(0) == PATH_SEPARATOR) {
130        name = name.substring(1);
131      }
132      int index=name.indexOf(PATH_SEPARATOR);
133      if (index !=-1)
134        name = name.substring(0, index);
135      return name;
136    }
137        
138    /** Add node <i>n</i> to the subtree of this node 
139     * @param n node to be added
140     * @return true if the node is added; false otherwise
141     */
142    boolean add(Node n) {
143      if (!isAncestor(n))
144        throw new IllegalArgumentException(n.getName()+", which is located at "
145                +n.getNetworkLocation()+", is not a decendent of "
146                +getPath(this));
147      if (isParent(n)) {
148        // this node is the parent of n; add n directly
149        n.setParent(this);
150        n.setLevel(this.level+1);
151        for(int i=0; i<children.size(); i++) {
152          if (children.get(i).getName().equals(n.getName())) {
153            children.set(i, n);
154            return false;
155          }
156        }
157        children.add(n);
158        numOfLeaves++;
159        return true;
160      } else {
161        // find the next ancestor node
162        String parentName = getNextAncestorName(n);
163        InnerNode parentNode = null;
164        for(int i=0; i<children.size(); i++) {
165          if (children.get(i).getName().equals(parentName)) {
166            parentNode = (InnerNode)children.get(i);
167            break;
168          }
169        }
170        if (parentNode == null) {
171          // create a new InnerNode
172          parentNode = new InnerNode(parentName, getPath(this),
173                                     this, this.getLevel()+1);
174          children.add(parentNode);
175        }
176        // add n to the subtree of the next ancestor node
177        if (parentNode.add(n)) {
178          numOfLeaves++;
179          return true;
180        } else {
181          return false;
182        }
183      }
184    }
185        
186    /** Remove node <i>n</i> from the subtree of this node
187     * @param n node to be deleted 
188     * @return true if the node is deleted; false otherwise
189     */
190    boolean remove(Node n) {
191      String parent = n.getNetworkLocation();
192      String currentPath = getPath(this);
193      if (!isAncestor(n))
194        throw new IllegalArgumentException(n.getName()
195                                           +", which is located at "
196                                           +parent+", is not a descendent of "+currentPath);
197      if (isParent(n)) {
198        // this node is the parent of n; remove n directly
199        for(int i=0; i<children.size(); i++) {
200          if (children.get(i).getName().equals(n.getName())) {
201            children.remove(i);
202            numOfLeaves--;
203            n.setParent(null);
204            return true;
205          }
206        }
207        return false;
208      } else {
209        // find the next ancestor node: the parent node
210        String parentName = getNextAncestorName(n);
211        InnerNode parentNode = null;
212        int i;
213        for(i=0; i<children.size(); i++) {
214          if (children.get(i).getName().equals(parentName)) {
215            parentNode = (InnerNode)children.get(i);
216            break;
217          }
218        }
219        if (parentNode==null) {
220          return false;
221        }
222        // remove n from the parent node
223        boolean isRemoved = parentNode.remove(n);
224        // if the parent node has no children, remove the parent node too
225        if (isRemoved) {
226          if (parentNode.getNumOfChildren() == 0) {
227            children.remove(i);
228          }
229          numOfLeaves--;
230        }
231        return isRemoved;
232      }
233    } // end of remove
234        
235    /** Given a node's string representation, return a reference to the node
236     * @param loc string location of the form /rack/node
237     * @return null if the node is not found or the childnode is there but
238     * not an instance of {@link InnerNode}
239     */
240    private Node getLoc(String loc) {
241      if (loc == null || loc.length() == 0) return this;
242            
243      String[] path = loc.split(PATH_SEPARATOR_STR, 2);
244      Node childnode = null;
245      for(int i=0; i<children.size(); i++) {
246        if (children.get(i).getName().equals(path[0])) {
247          childnode = children.get(i);
248        }
249      }
250      if (childnode == null) return null; // non-existing node
251      if (path.length == 1) return childnode;
252      if (childnode instanceof InnerNode) {
253        return ((InnerNode)childnode).getLoc(path[1]);
254      } else {
255        return null;
256      }
257    }
258        
259    /** get <i>leafIndex</i> leaf of this subtree 
260     * if it is not in the <i>excludedNode</i>
261     *
262     * @param leafIndex an indexed leaf of the node
263     * @param excludedNode an excluded node (can be null)
264     * @return
265     */
266    private Node getLeaf(int leafIndex, Node excludedNode) {
267      int count=0;
268      // check if the excluded node a leaf
269      boolean isLeaf =
270        excludedNode == null || !(excludedNode instanceof InnerNode);
271      // calculate the total number of excluded leaf nodes
272      int numOfExcludedLeaves =
273        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
274      if (isRack()) { // children are leaves
275        if (isLeaf) { // excluded node is a leaf node
276          int excludedIndex = children.indexOf(excludedNode);
277          if (excludedIndex != -1 && leafIndex >= 0) {
278            // excluded node is one of the children so adjust the leaf index
279            leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
280          }
281        }
282        // range check
283        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
284          return null;
285        }
286        return children.get(leafIndex);
287      } else {
288        for(int i=0; i<children.size(); i++) {
289          InnerNode child = (InnerNode)children.get(i);
290          if (excludedNode == null || excludedNode != child) {
291            // not the excludedNode
292            int numOfLeaves = child.getNumOfLeaves();
293            if (excludedNode != null && child.isAncestor(excludedNode)) {
294              numOfLeaves -= numOfExcludedLeaves;
295            }
296            if (count+numOfLeaves > leafIndex) {
297              // the leaf is in the child subtree
298              return child.getLeaf(leafIndex-count, excludedNode);
299            } else {
300              // go to the next child
301              count = count+numOfLeaves;
302            }
303          } else { // it is the excluededNode
304            // skip it and set the excludedNode to be null
305            excludedNode = null;
306          }
307        }
308        return null;
309      }
310    }
311        
312    int getNumOfLeaves() {
313      return numOfLeaves;
314    }
315  } // end of InnerNode
316
317  /**
318   * the root cluster map
319   */
320  InnerNode clusterMap = new InnerNode(InnerNode.ROOT);
321  /** Depth of all leaf nodes */
322  private int depthOfAllLeaves = -1;
323  /** rack counter */
324  private int numOfRacks = 0;
325  /** the lock used to manage access */
326  private ReadWriteLock netlock;
327    
328  public NetworkTopology() {
329    netlock = new ReentrantReadWriteLock();
330  }
331    
332  /** Add a leaf node
333   * Update node counter & rack counter if necessary
334   * @param node node to be added; can be null
335   * @exception IllegalArgumentException if add a node to a leave 
336                                         or node to be added is not a leaf
337   */
338  public void add(Node node) {
339    if (node==null) return;
340    String oldTopoStr = this.toString();
341    if( node instanceof InnerNode ) {
342      throw new IllegalArgumentException(
343        "Not allow to add an inner node: "+NodeBase.getPath(node));
344    }
345    netlock.writeLock().lock();
346    try {
347      Node rack = getNode(node.getNetworkLocation());
348      if (rack != null && !(rack instanceof InnerNode)) {
349        throw new IllegalArgumentException("Unexpected data node " 
350                                           + node.toString() 
351                                           + " at an illegal network location");
352      }
353      if (clusterMap.add(node)) {
354        LOG.info("Adding a new node: "+NodeBase.getPath(node));
355        if (rack == null) {
356          numOfRacks++;
357        }
358        if (!(node instanceof InnerNode)) {
359          if (depthOfAllLeaves == -1) {
360            depthOfAllLeaves = node.getLevel();
361          } else {
362            if (depthOfAllLeaves != node.getLevel()) {
363              LOG.error("Error: can't add leaf node at depth " +
364                  node.getLevel() + " to topology:\n" + oldTopoStr);
365              throw new InvalidTopologyException("Invalid network topology. " +
366                  "You cannot have a rack and a non-rack node at the same " +
367                  "level of the network topology.");
368            }
369          }
370        }
371      }
372      if(LOG.isDebugEnabled()) {
373        LOG.debug("NetworkTopology became:\n" + this.toString());
374      }
375    } finally {
376      netlock.writeLock().unlock();
377    }
378  }
379    
380  /** Remove a node
381   * Update node counter and rack counter if necessary
382   * @param node node to be removed; can be null
383   */ 
384  public void remove(Node node) {
385    if (node==null) return;
386    if( node instanceof InnerNode ) {
387      throw new IllegalArgumentException(
388        "Not allow to remove an inner node: "+NodeBase.getPath(node));
389    }
390    LOG.info("Removing a node: "+NodeBase.getPath(node));
391    netlock.writeLock().lock();
392    try {
393      if (clusterMap.remove(node)) {
394        InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
395        if (rack == null) {
396          numOfRacks--;
397        }
398      }
399      if(LOG.isDebugEnabled()) {
400        LOG.debug("NetworkTopology became:\n" + this.toString());
401      }
402    } finally {
403      netlock.writeLock().unlock();
404    }
405  }
406       
407  /** Check if the tree contains node <i>node</i>
408   * 
409   * @param node a node
410   * @return true if <i>node</i> is already in the tree; false otherwise
411   */
412  public boolean contains(Node node) {
413    if (node == null) return false;
414    netlock.readLock().lock();
415    try {
416      Node parent = node.getParent();
417      for (int level = node.getLevel(); parent != null && level > 0;
418           parent = parent.getParent(), level--) {
419        if (parent == clusterMap) {
420          return true;
421        }
422      }
423    } finally {
424      netlock.readLock().unlock();
425    }
426    return false; 
427  }
428    
429  /** Given a string representation of a node, return its reference
430   * 
431   * @param loc
432   *          a path-like string representation of a node
433   * @return a reference to the node; null if the node is not in the tree
434   */
435  public Node getNode(String loc) {
436    netlock.readLock().lock();
437    try {
438      loc = NodeBase.normalize(loc);
439      if (!NodeBase.ROOT.equals(loc))
440        loc = loc.substring(1);
441      return clusterMap.getLoc(loc);
442    } finally {
443      netlock.readLock().unlock();
444    }
445  }
446    
447  /** @return the total number of racks */
448  public int getNumOfRacks() {
449    netlock.readLock().lock();
450    try {
451      return numOfRacks;
452    } finally {
453      netlock.readLock().unlock();
454    }
455  }
456    
457  /** @return the total number of leaf nodes */
458  public int getNumOfLeaves() {
459    netlock.readLock().lock();
460    try {
461      return clusterMap.getNumOfLeaves();
462    } finally {
463      netlock.readLock().unlock();
464    }
465  }
466    
467  /** Return the distance between two nodes
468   * It is assumed that the distance from one node to its parent is 1
469   * The distance between two nodes is calculated by summing up their distances
470   * to their closest common ancestor.
471   * @param node1 one node
472   * @param node2 another node
473   * @return the distance between node1 and node2 which is zero if they are the same
474   *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
475   */
476  public int getDistance(Node node1, Node node2) {
477    if (node1 == node2) {
478      return 0;
479    }
480    Node n1=node1, n2=node2;
481    int dis = 0;
482    netlock.readLock().lock();
483    try {
484      int level1=node1.getLevel(), level2=node2.getLevel();
485      while(n1!=null && level1>level2) {
486        n1 = n1.getParent();
487        level1--;
488        dis++;
489      }
490      while(n2!=null && level2>level1) {
491        n2 = n2.getParent();
492        level2--;
493        dis++;
494      }
495      while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
496        n1=n1.getParent();
497        n2=n2.getParent();
498        dis+=2;
499      }
500    } finally {
501      netlock.readLock().unlock();
502    }
503    if (n1==null) {
504      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
505      return Integer.MAX_VALUE;
506    }
507    if (n2==null) {
508      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
509      return Integer.MAX_VALUE;
510    }
511    return dis+2;
512  } 
513    
514  /** Check if two nodes are on the same rack
515   * @param node1 one node (can be null)
516   * @param node2 another node (can be null)
517   * @return true if node1 and node2 are on the same rack; false otherwise
518   * @exception IllegalArgumentException when either node1 or node2 is null, or
519   * node1 or node2 do not belong to the cluster
520   */
521  public boolean isOnSameRack( Node node1,  Node node2) {
522    if (node1 == null || node2 == null) {
523      return false;
524    }
525      
526    netlock.readLock().lock();
527    try {
528      return node1.getParent()==node2.getParent();
529    } finally {
530      netlock.readLock().unlock();
531    }
532  }
533    
534  final private static Random r = new Random();
535  /** randomly choose one node from <i>scope</i>
536   * if scope starts with ~, choose one from the all nodes except for the
537   * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
538   * @param scope range of nodes from which a node will be chosen
539   * @return the chosen node
540   */
541  public Node chooseRandom(String scope) {
542    netlock.readLock().lock();
543    try {
544      if (scope.startsWith("~")) {
545        return chooseRandom(NodeBase.ROOT, scope.substring(1));
546      } else {
547        return chooseRandom(scope, null);
548      }
549    } finally {
550      netlock.readLock().unlock();
551    }
552  }
553    
554  private Node chooseRandom(String scope, String excludedScope){
555    if (excludedScope != null) {
556      if (scope.startsWith(excludedScope)) {
557        return null;
558      }
559      if (!excludedScope.startsWith(scope)) {
560        excludedScope = null;
561      }
562    }
563    Node node = getNode(scope);
564    if (!(node instanceof InnerNode)) {
565      return node;
566    }
567    InnerNode innerNode = (InnerNode)node;
568    int numOfDatanodes = innerNode.getNumOfLeaves();
569    if (excludedScope == null) {
570      node = null;
571    } else {
572      node = getNode(excludedScope);
573      if (!(node instanceof InnerNode)) {
574        numOfDatanodes -= 1;
575      } else {
576        numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
577      }
578    }
579    int leaveIndex = r.nextInt(numOfDatanodes);
580    return innerNode.getLeaf(leaveIndex, node);
581  }
582       
583  /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
584   * if scope starts with ~, return the number of nodes that are not
585   * in <i>scope</i> and <i>excludedNodes</i>; 
586   * @param scope a path string that may start with ~
587   * @param excludedNodes a list of nodes
588   * @return number of available nodes
589   */
590  public int countNumOfAvailableNodes(String scope,
591                                      Collection<Node> excludedNodes) {
592    boolean isExcluded=false;
593    if (scope.startsWith("~")) {
594      isExcluded=true;
595      scope=scope.substring(1);
596    }
597    scope = NodeBase.normalize(scope);
598    int count=0; // the number of nodes in both scope & excludedNodes
599    netlock.readLock().lock();
600    try {
601      for(Node node:excludedNodes) {
602        if ((NodeBase.getPath(node)+NodeBase.PATH_SEPARATOR_STR).
603            startsWith(scope+NodeBase.PATH_SEPARATOR_STR)) {
604          count++;
605        }
606      }
607      Node n=getNode(scope);
608      int scopeNodeCount=1;
609      if (n instanceof InnerNode) {
610        scopeNodeCount=((InnerNode)n).getNumOfLeaves();
611      }
612      if (isExcluded) {
613        return clusterMap.getNumOfLeaves()-
614          scopeNodeCount-excludedNodes.size()+count;
615      } else {
616        return scopeNodeCount-count;
617      }
618    } finally {
619      netlock.readLock().unlock();
620    }
621  }
622    
623  /** convert a network tree to a string */
624  @Override
625  public String toString() {
626    // print the number of racks
627    StringBuilder tree = new StringBuilder();
628    tree.append("Number of racks: ");
629    tree.append(numOfRacks);
630    tree.append("\n");
631    // print the number of leaves
632    int numOfLeaves = getNumOfLeaves();
633    tree.append("Expected number of leaves:");
634    tree.append(numOfLeaves);
635    tree.append("\n");
636    // print nodes
637    for(int i=0; i<numOfLeaves; i++) {
638      tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
639      tree.append("\n");
640    }
641    return tree.toString();
642  }
643
644  /* swap two array items */
645  static private void swap(Node[] nodes, int i, int j) {
646    Node tempNode;
647    tempNode = nodes[j];
648    nodes[j] = nodes[i];
649    nodes[i] = tempNode;
650    
651  }
652  
653  /** Sort nodes array by their distances to <i>reader</i>
654   * It linearly scans the array, if a local node is found, swap it with
655   * the first element of the array.
656   * If a local rack node is found, swap it with the first element following
657   * the local node.
658   * If neither local node or local rack node is found, put a random replica
659   * location at position 0.
660   * It leaves the rest nodes untouched.
661   * @param reader the node that wishes to read a block from one of the nodes
662   * @param nodes the list of nodes containing data for the reader
663   */
664  public void pseudoSortByDistance( Node reader, Node[] nodes ) {
665    int tempIndex = 0;
666    int localRackNode = -1;
667    if (reader != null ) {
668      //scan the array to find the local node & local rack node
669      for(int i=0; i<nodes.length; i++) {
670        if(tempIndex == 0 && reader == nodes[i]) { //local node
671          //swap the local node and the node at position 0
672          if( i != 0 ) {
673            swap(nodes, tempIndex, i);
674          }
675          tempIndex=1;
676          if(localRackNode != -1 ) {
677            if(localRackNode == 0) {
678              localRackNode = i;
679            }
680            break;
681          }
682        } else if(localRackNode == -1 && isOnSameRack(reader, nodes[i])) {
683          //local rack
684          localRackNode = i;
685          if(tempIndex != 0 ) break;
686        }
687      }
688
689      // swap the local rack node and the node at position tempIndex
690      if(localRackNode != -1 && localRackNode != tempIndex ) {
691        swap(nodes, tempIndex, localRackNode);
692        tempIndex++;
693      }
694    }
695    
696    // put a random node at position 0 if it is not a local/local-rack node
697    if(tempIndex == 0 && localRackNode == -1 && nodes.length != 0) {
698      swap(nodes, 0, r.nextInt(nodes.length));
699    }
700  }
701}