001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.net;
019    
020    import java.util.ArrayList;
021    import java.util.Collection;
022    import java.util.Random;
023    import java.util.concurrent.locks.ReadWriteLock;
024    import java.util.concurrent.locks.ReentrantReadWriteLock;
025    
026    import org.apache.commons.logging.Log;
027    import org.apache.commons.logging.LogFactory;
028    import org.apache.hadoop.classification.InterfaceAudience;
029    import org.apache.hadoop.classification.InterfaceStability;
030    
/** The class represents a cluster of computers with a tree hierarchical
 * network topology.
 * For example, a cluster may consist of many data centers filled
 * with racks of computers.
 * In a network topology, leaves represent data nodes (computers) and inner
 * nodes represent switches/routers that manage traffic in/out of data centers
 * or racks.
 *
 */
040    @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
041    @InterfaceStability.Unstable
042    public class NetworkTopology {
043      public final static String DEFAULT_RACK = "/default-rack";
044      public final static int DEFAULT_HOST_LEVEL = 2;
045      public static final Log LOG = 
046        LogFactory.getLog(NetworkTopology.class);
047        
048      public static class InvalidTopologyException extends RuntimeException {
049        private static final long serialVersionUID = 1L;
050        public InvalidTopologyException(String msg) {
051          super(msg);
052        }
053      }
054    
055      /** InnerNode represents a switch/router of a data center or rack.
056       * Different from a leaf node, it has non-null children.
057       */
058      private class InnerNode extends NodeBase {
059        private ArrayList<Node> children=new ArrayList<Node>();
060        private int numOfLeaves;
061            
062        /** Construct an InnerNode from a path-like string */
063        InnerNode(String path) {
064          super(path);
065        }
066            
067        /** Construct an InnerNode from its name and its network location */
068        InnerNode(String name, String location) {
069          super(name, location);
070        }
071            
072        /** Construct an InnerNode
073         * from its name, its network location, its parent, and its level */
074        InnerNode(String name, String location, InnerNode parent, int level) {
075          super(name, location, parent, level);
076        }
077            
078        /** @return its children */
079        Collection<Node> getChildren() {return children;}
080            
081        /** @return the number of children this node has */
082        int getNumOfChildren() {
083          return children.size();
084        }
085            
086        /** Judge if this node represents a rack 
087         * @return true if it has no child or its children are not InnerNodes
088         */ 
089        boolean isRack() {
090          if (children.isEmpty()) {
091            return true;
092          }
093                
094          Node firstChild = children.get(0);
095          if (firstChild instanceof InnerNode) {
096            return false;
097          }
098                
099          return true;
100        }
101            
102        /** Judge if this node is an ancestor of node <i>n</i>
103         * 
104         * @param n a node
105         * @return true if this node is an ancestor of <i>n</i>
106         */
107        boolean isAncestor(Node n) {
108          return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
109            (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
110            startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
111        }
112            
113        /** Judge if this node is the parent of node <i>n</i>
114         * 
115         * @param n a node
116         * @return true if this node is the parent of <i>n</i>
117         */
118        boolean isParent(Node n) {
119          return n.getNetworkLocation().equals(getPath(this));
120        }
121            
122        /* Return a child name of this node who is an ancestor of node <i>n</i> */
123        private String getNextAncestorName(Node n) {
124          if (!isAncestor(n)) {
125            throw new IllegalArgumentException(
126                                               this + "is not an ancestor of " + n);
127          }
128          String name = n.getNetworkLocation().substring(getPath(this).length());
129          if (name.charAt(0) == PATH_SEPARATOR) {
130            name = name.substring(1);
131          }
132          int index=name.indexOf(PATH_SEPARATOR);
133          if (index !=-1)
134            name = name.substring(0, index);
135          return name;
136        }
137            
138        /** Add node <i>n</i> to the subtree of this node 
139         * @param n node to be added
140         * @return true if the node is added; false otherwise
141         */
142        boolean add(Node n) {
143          if (!isAncestor(n))
144            throw new IllegalArgumentException(n.getName()+", which is located at "
145                    +n.getNetworkLocation()+", is not a decendent of "
146                    +getPath(this));
147          if (isParent(n)) {
148            // this node is the parent of n; add n directly
149            n.setParent(this);
150            n.setLevel(this.level+1);
151            for(int i=0; i<children.size(); i++) {
152              if (children.get(i).getName().equals(n.getName())) {
153                children.set(i, n);
154                return false;
155              }
156            }
157            children.add(n);
158            numOfLeaves++;
159            return true;
160          } else {
161            // find the next ancestor node
162            String parentName = getNextAncestorName(n);
163            InnerNode parentNode = null;
164            for(int i=0; i<children.size(); i++) {
165              if (children.get(i).getName().equals(parentName)) {
166                parentNode = (InnerNode)children.get(i);
167                break;
168              }
169            }
170            if (parentNode == null) {
171              // create a new InnerNode
172              parentNode = new InnerNode(parentName, getPath(this),
173                                         this, this.getLevel()+1);
174              children.add(parentNode);
175            }
176            // add n to the subtree of the next ancestor node
177            if (parentNode.add(n)) {
178              numOfLeaves++;
179              return true;
180            } else {
181              return false;
182            }
183          }
184        }
185            
186        /** Remove node <i>n</i> from the subtree of this node
187         * @param n node to be deleted 
188         * @return true if the node is deleted; false otherwise
189         */
190        boolean remove(Node n) {
191          String parent = n.getNetworkLocation();
192          String currentPath = getPath(this);
193          if (!isAncestor(n))
194            throw new IllegalArgumentException(n.getName()
195                                               +", which is located at "
196                                               +parent+", is not a descendent of "+currentPath);
197          if (isParent(n)) {
198            // this node is the parent of n; remove n directly
199            for(int i=0; i<children.size(); i++) {
200              if (children.get(i).getName().equals(n.getName())) {
201                children.remove(i);
202                numOfLeaves--;
203                n.setParent(null);
204                return true;
205              }
206            }
207            return false;
208          } else {
209            // find the next ancestor node: the parent node
210            String parentName = getNextAncestorName(n);
211            InnerNode parentNode = null;
212            int i;
213            for(i=0; i<children.size(); i++) {
214              if (children.get(i).getName().equals(parentName)) {
215                parentNode = (InnerNode)children.get(i);
216                break;
217              }
218            }
219            if (parentNode==null) {
220              return false;
221            }
222            // remove n from the parent node
223            boolean isRemoved = parentNode.remove(n);
224            // if the parent node has no children, remove the parent node too
225            if (isRemoved) {
226              if (parentNode.getNumOfChildren() == 0) {
227                children.remove(i);
228              }
229              numOfLeaves--;
230            }
231            return isRemoved;
232          }
233        } // end of remove
234            
235        /** Given a node's string representation, return a reference to the node
236         * @param loc string location of the form /rack/node
237         * @return null if the node is not found or the childnode is there but
238         * not an instance of {@link InnerNode}
239         */
240        private Node getLoc(String loc) {
241          if (loc == null || loc.length() == 0) return this;
242                
243          String[] path = loc.split(PATH_SEPARATOR_STR, 2);
244          Node childnode = null;
245          for(int i=0; i<children.size(); i++) {
246            if (children.get(i).getName().equals(path[0])) {
247              childnode = children.get(i);
248            }
249          }
250          if (childnode == null) return null; // non-existing node
251          if (path.length == 1) return childnode;
252          if (childnode instanceof InnerNode) {
253            return ((InnerNode)childnode).getLoc(path[1]);
254          } else {
255            return null;
256          }
257        }
258            
    /** Get the <i>leafIndex</i>-th leaf of this subtree, numbering the
     * leaves in child order and skipping everything covered by
     * <i>excludedNode</i>.
     *
     * @param leafIndex an indexed leaf of the node
     * @param excludedNode an excluded node (can be null); either a leaf or
     *                     an inner node whose whole subtree is excluded
     * @return the selected leaf, or null if <i>leafIndex</i> is out of range
     */
    private Node getLeaf(int leafIndex, Node excludedNode) {
      // number of eligible leaves in the children already passed over
      int count=0;
      // treat the excluded node as a leaf when it is null or not an InnerNode
      boolean isLeaf =
        excludedNode == null || !(excludedNode instanceof InnerNode);
      // calculate the total number of excluded leaf nodes
      int numOfExcludedLeaves =
        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
      if (isRack()) { // children are leaves
        if (isLeaf) { // excluded node is a leaf node
          int excludedIndex = children.indexOf(excludedNode);
          if (excludedIndex != -1 && leafIndex >= 0) {
            // excluded node is one of the children so adjust the leaf index:
            // indexes at or after the excluded child shift up by one
            leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
          }
        }
        // range check
        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
          return null;
        }
        return children.get(leafIndex);
      } else {
        for(int i=0; i<children.size(); i++) {
          InnerNode child = (InnerNode)children.get(i);
          if (excludedNode == null || excludedNode != child) {
            // not the excludedNode
            int numOfLeaves = child.getNumOfLeaves();
            if (excludedNode != null && child.isAncestor(excludedNode)) {
              // the excluded node lies inside this child: discount its leaves
              numOfLeaves -= numOfExcludedLeaves;
            }
            if (count+numOfLeaves > leafIndex) {
              // the leaf is in the child subtree; pass the index relative
              // to that subtree
              return child.getLeaf(leafIndex-count, excludedNode);
            } else {
              // go to the next child
              count = count+numOfLeaves;
            }
          } else { // it is the excludedNode
            // skip it and set the excludedNode to be null so the remaining
            // children are handled without any exclusion
            excludedNode = null;
          }
        }
        // leafIndex is beyond the number of eligible leaves
        return null;
      }
    }
311            
312        int getNumOfLeaves() {
313          return numOfLeaves;
314        }
315      } // end of InnerNode
316    
317      /**
318       * the root cluster map
319       */
320      InnerNode clusterMap = new InnerNode(InnerNode.ROOT);
321      /** Depth of all leaf nodes */
322      private int depthOfAllLeaves = -1;
323      /** rack counter */
324      private int numOfRacks = 0;
325      /** the lock used to manage access */
326      private ReadWriteLock netlock;
327        
328      public NetworkTopology() {
329        netlock = new ReentrantReadWriteLock();
330      }
331        
332      /** Add a leaf node
333       * Update node counter & rack counter if necessary
334       * @param node node to be added; can be null
335       * @exception IllegalArgumentException if add a node to a leave 
336                                             or node to be added is not a leaf
337       */
338      public void add(Node node) {
339        if (node==null) return;
340        String oldTopoStr = this.toString();
341        if( node instanceof InnerNode ) {
342          throw new IllegalArgumentException(
343            "Not allow to add an inner node: "+NodeBase.getPath(node));
344        }
345        netlock.writeLock().lock();
346        try {
347          Node rack = getNode(node.getNetworkLocation());
348          if (rack != null && !(rack instanceof InnerNode)) {
349            throw new IllegalArgumentException("Unexpected data node " 
350                                               + node.toString() 
351                                               + " at an illegal network location");
352          }
353          if (clusterMap.add(node)) {
354            LOG.info("Adding a new node: "+NodeBase.getPath(node));
355            if (rack == null) {
356              numOfRacks++;
357            }
358            if (!(node instanceof InnerNode)) {
359              if (depthOfAllLeaves == -1) {
360                depthOfAllLeaves = node.getLevel();
361              } else {
362                if (depthOfAllLeaves != node.getLevel()) {
363                  LOG.error("Error: can't add leaf node at depth " +
364                      node.getLevel() + " to topology:\n" + oldTopoStr);
365                  throw new InvalidTopologyException("Invalid network topology. " +
366                      "You cannot have a rack and a non-rack node at the same " +
367                      "level of the network topology.");
368                }
369              }
370            }
371          }
372          if(LOG.isDebugEnabled()) {
373            LOG.debug("NetworkTopology became:\n" + this.toString());
374          }
375        } finally {
376          netlock.writeLock().unlock();
377        }
378      }
379        
380      /** Remove a node
381       * Update node counter and rack counter if necessary
382       * @param node node to be removed; can be null
383       */ 
384      public void remove(Node node) {
385        if (node==null) return;
386        if( node instanceof InnerNode ) {
387          throw new IllegalArgumentException(
388            "Not allow to remove an inner node: "+NodeBase.getPath(node));
389        }
390        LOG.info("Removing a node: "+NodeBase.getPath(node));
391        netlock.writeLock().lock();
392        try {
393          if (clusterMap.remove(node)) {
394            InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
395            if (rack == null) {
396              numOfRacks--;
397            }
398          }
399          if(LOG.isDebugEnabled()) {
400            LOG.debug("NetworkTopology became:\n" + this.toString());
401          }
402        } finally {
403          netlock.writeLock().unlock();
404        }
405      }
406           
407      /** Check if the tree contains node <i>node</i>
408       * 
409       * @param node a node
410       * @return true if <i>node</i> is already in the tree; false otherwise
411       */
412      public boolean contains(Node node) {
413        if (node == null) return false;
414        netlock.readLock().lock();
415        try {
416          Node parent = node.getParent();
417          for (int level = node.getLevel(); parent != null && level > 0;
418               parent = parent.getParent(), level--) {
419            if (parent == clusterMap) {
420              return true;
421            }
422          }
423        } finally {
424          netlock.readLock().unlock();
425        }
426        return false; 
427      }
428        
429      /** Given a string representation of a node, return its reference
430       * 
431       * @param loc
432       *          a path-like string representation of a node
433       * @return a reference to the node; null if the node is not in the tree
434       */
435      public Node getNode(String loc) {
436        netlock.readLock().lock();
437        try {
438          loc = NodeBase.normalize(loc);
439          if (!NodeBase.ROOT.equals(loc))
440            loc = loc.substring(1);
441          return clusterMap.getLoc(loc);
442        } finally {
443          netlock.readLock().unlock();
444        }
445      }
446        
447      /** @return the total number of racks */
448      public int getNumOfRacks() {
449        netlock.readLock().lock();
450        try {
451          return numOfRacks;
452        } finally {
453          netlock.readLock().unlock();
454        }
455      }
456        
457      /** @return the total number of leaf nodes */
458      public int getNumOfLeaves() {
459        netlock.readLock().lock();
460        try {
461          return clusterMap.getNumOfLeaves();
462        } finally {
463          netlock.readLock().unlock();
464        }
465      }
466        
467      /** Return the distance between two nodes
468       * It is assumed that the distance from one node to its parent is 1
469       * The distance between two nodes is calculated by summing up their distances
470       * to their closest common ancestor.
471       * @param node1 one node
472       * @param node2 another node
473       * @return the distance between node1 and node2 which is zero if they are the same
474       *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
475       */
476      public int getDistance(Node node1, Node node2) {
477        if (node1 == node2) {
478          return 0;
479        }
480        Node n1=node1, n2=node2;
481        int dis = 0;
482        netlock.readLock().lock();
483        try {
484          int level1=node1.getLevel(), level2=node2.getLevel();
485          while(n1!=null && level1>level2) {
486            n1 = n1.getParent();
487            level1--;
488            dis++;
489          }
490          while(n2!=null && level2>level1) {
491            n2 = n2.getParent();
492            level2--;
493            dis++;
494          }
495          while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
496            n1=n1.getParent();
497            n2=n2.getParent();
498            dis+=2;
499          }
500        } finally {
501          netlock.readLock().unlock();
502        }
503        if (n1==null) {
504          LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
505          return Integer.MAX_VALUE;
506        }
507        if (n2==null) {
508          LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
509          return Integer.MAX_VALUE;
510        }
511        return dis+2;
512      } 
513        
514      /** Check if two nodes are on the same rack
515       * @param node1 one node (can be null)
516       * @param node2 another node (can be null)
517       * @return true if node1 and node2 are on the same rack; false otherwise
518       * @exception IllegalArgumentException when either node1 or node2 is null, or
519       * node1 or node2 do not belong to the cluster
520       */
521      public boolean isOnSameRack( Node node1,  Node node2) {
522        if (node1 == null || node2 == null) {
523          return false;
524        }
525          
526        netlock.readLock().lock();
527        try {
528          return node1.getParent()==node2.getParent();
529        } finally {
530          netlock.readLock().unlock();
531        }
532      }
533        
534      final private static Random r = new Random();
535      /** randomly choose one node from <i>scope</i>
536       * if scope starts with ~, choose one from the all nodes except for the
537       * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
538       * @param scope range of nodes from which a node will be chosen
539       * @return the chosen node
540       */
541      public Node chooseRandom(String scope) {
542        netlock.readLock().lock();
543        try {
544          if (scope.startsWith("~")) {
545            return chooseRandom(NodeBase.ROOT, scope.substring(1));
546          } else {
547            return chooseRandom(scope, null);
548          }
549        } finally {
550          netlock.readLock().unlock();
551        }
552      }
553        
554      private Node chooseRandom(String scope, String excludedScope){
555        if (excludedScope != null) {
556          if (scope.startsWith(excludedScope)) {
557            return null;
558          }
559          if (!excludedScope.startsWith(scope)) {
560            excludedScope = null;
561          }
562        }
563        Node node = getNode(scope);
564        if (!(node instanceof InnerNode)) {
565          return node;
566        }
567        InnerNode innerNode = (InnerNode)node;
568        int numOfDatanodes = innerNode.getNumOfLeaves();
569        if (excludedScope == null) {
570          node = null;
571        } else {
572          node = getNode(excludedScope);
573          if (!(node instanceof InnerNode)) {
574            numOfDatanodes -= 1;
575          } else {
576            numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
577          }
578        }
579        int leaveIndex = r.nextInt(numOfDatanodes);
580        return innerNode.getLeaf(leaveIndex, node);
581      }
582           
583      /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
584       * if scope starts with ~, return the number of nodes that are not
585       * in <i>scope</i> and <i>excludedNodes</i>; 
586       * @param scope a path string that may start with ~
587       * @param excludedNodes a list of nodes
588       * @return number of available nodes
589       */
590      public int countNumOfAvailableNodes(String scope,
591                                          Collection<Node> excludedNodes) {
592        boolean isExcluded=false;
593        if (scope.startsWith("~")) {
594          isExcluded=true;
595          scope=scope.substring(1);
596        }
597        scope = NodeBase.normalize(scope);
598        int count=0; // the number of nodes in both scope & excludedNodes
599        netlock.readLock().lock();
600        try {
601          for(Node node:excludedNodes) {
602            if ((NodeBase.getPath(node)+NodeBase.PATH_SEPARATOR_STR).
603                startsWith(scope+NodeBase.PATH_SEPARATOR_STR)) {
604              count++;
605            }
606          }
607          Node n=getNode(scope);
608          int scopeNodeCount=1;
609          if (n instanceof InnerNode) {
610            scopeNodeCount=((InnerNode)n).getNumOfLeaves();
611          }
612          if (isExcluded) {
613            return clusterMap.getNumOfLeaves()-
614              scopeNodeCount-excludedNodes.size()+count;
615          } else {
616            return scopeNodeCount-count;
617          }
618        } finally {
619          netlock.readLock().unlock();
620        }
621      }
622        
623      /** convert a network tree to a string */
624      @Override
625      public String toString() {
626        // print the number of racks
627        StringBuilder tree = new StringBuilder();
628        tree.append("Number of racks: ");
629        tree.append(numOfRacks);
630        tree.append("\n");
631        // print the number of leaves
632        int numOfLeaves = getNumOfLeaves();
633        tree.append("Expected number of leaves:");
634        tree.append(numOfLeaves);
635        tree.append("\n");
636        // print nodes
637        for(int i=0; i<numOfLeaves; i++) {
638          tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
639          tree.append("\n");
640        }
641        return tree.toString();
642      }
643    
644      /* swap two array items */
645      static private void swap(Node[] nodes, int i, int j) {
646        Node tempNode;
647        tempNode = nodes[j];
648        nodes[j] = nodes[i];
649        nodes[i] = tempNode;
650        
651      }
652      
  /** Sort nodes array by their distances to <i>reader</i>
   * It linearly scans the array, if a local node is found, swap it with
   * the first element of the array.
   * If a local rack node is found, swap it with the first element following
   * the local node.
   * If neither local node or local rack node is found, put a random replica
   * location at position 0.
   * It leaves the rest nodes untouched.
   * The array is reordered in place.
   * @param reader the node that wishes to read a block from one of the nodes;
   *               when null, only the random placement at position 0 applies
   * @param nodes the list of nodes containing data for the reader
   */
  public void pseudoSortByDistance( Node reader, Node[] nodes ) {
    // position that the next "close" node should be moved to
    int tempIndex = 0;
    // index of the first local-rack node seen, or -1 if none was found
    int localRackNode = -1;
    if (reader != null ) {
      //scan the array to find the local node & local rack node
      for(int i=0; i<nodes.length; i++) {
        if(tempIndex == 0 && reader == nodes[i]) { //local node
          //swap the local node and the node at position 0
          if( i != 0 ) {
            swap(nodes, tempIndex, i);
          }
          tempIndex=1;
          if(localRackNode != -1 ) {
            if(localRackNode == 0) {
              // the local-rack node was at position 0 and has just been
              // swapped to position i, so follow it there
              localRackNode = i;
            }
            // both the local node and a local-rack node are known
            break;
          }
        } else if(localRackNode == -1 && isOnSameRack(reader, nodes[i])) {
          //local rack
          localRackNode = i;
          // stop once the local node has been found as well
          if(tempIndex != 0 ) break;
        }
      }

      // swap the local rack node and the node at position tempIndex
      if(localRackNode != -1 && localRackNode != tempIndex ) {
        swap(nodes, tempIndex, localRackNode);
        tempIndex++;
      }
    }

    // put a random node at position 0 if it is not a local/local-rack node
    if(tempIndex == 0 && localRackNode == -1 && nodes.length != 0) {
      swap(nodes, 0, r.nextInt(nodes.length));
    }
  }
701    }