001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.net; 019 020import java.util.ArrayList; 021import java.util.Collection; 022import java.util.Random; 023import java.util.concurrent.locks.ReadWriteLock; 024import java.util.concurrent.locks.ReentrantReadWriteLock; 025 026import org.apache.commons.logging.Log; 027import org.apache.commons.logging.LogFactory; 028import org.apache.hadoop.classification.InterfaceAudience; 029import org.apache.hadoop.classification.InterfaceStability; 030 031/** The class represents a cluster of computer with a tree hierarchical 032 * network topology. 033 * For example, a cluster may be consists of many data centers filled 034 * with racks of computers. 035 * In a network topology, leaves represent data nodes (computers) and inner 036 * nodes represent switches/routers that manage traffic in/out of data centers 037 * or racks. 038 * 039 */ 040@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) 041@InterfaceStability.Unstable 042public class NetworkTopology { 043 public final static String DEFAULT_RACK = "/default-rack"; 044 public final static int DEFAULT_HOST_LEVEL = 2; 045 public static final Log LOG = 046 LogFactory.getLog(NetworkTopology.class); 047 048 public static class InvalidTopologyException extends RuntimeException { 049 private static final long serialVersionUID = 1L; 050 public InvalidTopologyException(String msg) { 051 super(msg); 052 } 053 } 054 055 /** InnerNode represents a switch/router of a data center or rack. 056 * Different from a leaf node, it has non-null children. 057 */ 058 private class InnerNode extends NodeBase { 059 private ArrayList<Node> children=new ArrayList<Node>(); 060 private int numOfLeaves; 061 062 /** Construct an InnerNode from a path-like string */ 063 InnerNode(String path) { 064 super(path); 065 } 066 067 /** Construct an InnerNode from its name and its network location */ 068 InnerNode(String name, String location) { 069 super(name, location); 070 } 071 072 /** Construct an InnerNode 073 * from its name, its network location, its parent, and its level */ 074 InnerNode(String name, String location, InnerNode parent, int level) { 075 super(name, location, parent, level); 076 } 077 078 /** @return its children */ 079 Collection<Node> getChildren() {return children;} 080 081 /** @return the number of children this node has */ 082 int getNumOfChildren() { 083 return children.size(); 084 } 085 086 /** Judge if this node represents a rack 087 * @return true if it has no child or its children are not InnerNodes 088 */ 089 boolean isRack() { 090 if (children.isEmpty()) { 091 return true; 092 } 093 094 Node firstChild = children.get(0); 095 if (firstChild instanceof InnerNode) { 096 return false; 097 } 098 099 return true; 100 } 101 102 /** Judge if this node is an ancestor of node <i>n</i> 103 * 104 * @param n a node 105 * @return true if this node is an ancestor of <i>n</i> 106 */ 107 boolean isAncestor(Node n) { 108 return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) || 109 (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR). 110 startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR); 111 } 112 113 /** Judge if this node is the parent of node <i>n</i> 114 * 115 * @param n a node 116 * @return true if this node is the parent of <i>n</i> 117 */ 118 boolean isParent(Node n) { 119 return n.getNetworkLocation().equals(getPath(this)); 120 } 121 122 /* Return a child name of this node who is an ancestor of node <i>n</i> */ 123 private String getNextAncestorName(Node n) { 124 if (!isAncestor(n)) { 125 throw new IllegalArgumentException( 126 this + "is not an ancestor of " + n); 127 } 128 String name = n.getNetworkLocation().substring(getPath(this).length()); 129 if (name.charAt(0) == PATH_SEPARATOR) { 130 name = name.substring(1); 131 } 132 int index=name.indexOf(PATH_SEPARATOR); 133 if (index !=-1) 134 name = name.substring(0, index); 135 return name; 136 } 137 138 /** Add node <i>n</i> to the subtree of this node 139 * @param n node to be added 140 * @return true if the node is added; false otherwise 141 */ 142 boolean add(Node n) { 143 if (!isAncestor(n)) 144 throw new IllegalArgumentException(n.getName()+", which is located at " 145 +n.getNetworkLocation()+", is not a decendent of " 146 +getPath(this)); 147 if (isParent(n)) { 148 // this node is the parent of n; add n directly 149 n.setParent(this); 150 n.setLevel(this.level+1); 151 for(int i=0; i<children.size(); i++) { 152 if (children.get(i).getName().equals(n.getName())) { 153 children.set(i, n); 154 return false; 155 } 156 } 157 children.add(n); 158 numOfLeaves++; 159 return true; 160 } else { 161 // find the next ancestor node 162 String parentName = getNextAncestorName(n); 163 InnerNode parentNode = null; 164 for(int i=0; i<children.size(); i++) { 165 if (children.get(i).getName().equals(parentName)) { 166 parentNode = (InnerNode)children.get(i); 167 break; 168 } 169 } 170 if (parentNode == null) { 171 // create a new InnerNode 172 parentNode = new InnerNode(parentName, getPath(this), 173 this, this.getLevel()+1); 174 children.add(parentNode); 175 } 176 // add n to the subtree of the next ancestor node 177 if (parentNode.add(n)) { 178 numOfLeaves++; 179 return true; 180 } else { 181 return false; 182 } 183 } 184 } 185 186 /** Remove node <i>n</i> from the subtree of this node 187 * @param n node to be deleted 188 * @return true if the node is deleted; false otherwise 189 */ 190 boolean remove(Node n) { 191 String parent = n.getNetworkLocation(); 192 String currentPath = getPath(this); 193 if (!isAncestor(n)) 194 throw new IllegalArgumentException(n.getName() 195 +", which is located at " 196 +parent+", is not a descendent of "+currentPath); 197 if (isParent(n)) { 198 // this node is the parent of n; remove n directly 199 for(int i=0; i<children.size(); i++) { 200 if (children.get(i).getName().equals(n.getName())) { 201 children.remove(i); 202 numOfLeaves--; 203 n.setParent(null); 204 return true; 205 } 206 } 207 return false; 208 } else { 209 // find the next ancestor node: the parent node 210 String parentName = getNextAncestorName(n); 211 InnerNode parentNode = null; 212 int i; 213 for(i=0; i<children.size(); i++) { 214 if (children.get(i).getName().equals(parentName)) { 215 parentNode = (InnerNode)children.get(i); 216 break; 217 } 218 } 219 if (parentNode==null) { 220 return false; 221 } 222 // remove n from the parent node 223 boolean isRemoved = parentNode.remove(n); 224 // if the parent node has no children, remove the parent node too 225 if (isRemoved) { 226 if (parentNode.getNumOfChildren() == 0) { 227 children.remove(i); 228 } 229 numOfLeaves--; 230 } 231 return isRemoved; 232 } 233 } // end of remove 234 235 /** Given a node's string representation, return a reference to the node 236 * @param loc string location of the form /rack/node 237 * @return null if the node is not found or the childnode is there but 238 * not an instance of {@link InnerNode} 239 */ 240 private Node getLoc(String loc) { 241 if (loc == null || loc.length() == 0) return this; 242 243 String[] path = loc.split(PATH_SEPARATOR_STR, 2); 244 Node childnode = null; 245 for(int i=0; i<children.size(); i++) { 246 if (children.get(i).getName().equals(path[0])) { 247 childnode = children.get(i); 248 } 249 } 250 if (childnode == null) return null; // non-existing node 251 if (path.length == 1) return childnode; 252 if (childnode instanceof InnerNode) { 253 return ((InnerNode)childnode).getLoc(path[1]); 254 } else { 255 return null; 256 } 257 } 258 259 /** get <i>leafIndex</i> leaf of this subtree 260 * if it is not in the <i>excludedNode</i> 261 * 262 * @param leafIndex an indexed leaf of the node 263 * @param excludedNode an excluded node (can be null) 264 * @return 265 */ 266 private Node getLeaf(int leafIndex, Node excludedNode) { 267 int count=0; 268 // check if the excluded node a leaf 269 boolean isLeaf = 270 excludedNode == null || !(excludedNode instanceof InnerNode); 271 // calculate the total number of excluded leaf nodes 272 int numOfExcludedLeaves = 273 isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves(); 274 if (isRack()) { // children are leaves 275 if (isLeaf) { // excluded node is a leaf node 276 int excludedIndex = children.indexOf(excludedNode); 277 if (excludedIndex != -1 && leafIndex >= 0) { 278 // excluded node is one of the children so adjust the leaf index 279 leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex; 280 } 281 } 282 // range check 283 if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) { 284 return null; 285 } 286 return children.get(leafIndex); 287 } else { 288 for(int i=0; i<children.size(); i++) { 289 InnerNode child = (InnerNode)children.get(i); 290 if (excludedNode == null || excludedNode != child) { 291 // not the excludedNode 292 int numOfLeaves = child.getNumOfLeaves(); 293 if (excludedNode != null && child.isAncestor(excludedNode)) { 294 numOfLeaves -= numOfExcludedLeaves; 295 } 296 if (count+numOfLeaves > leafIndex) { 297 // the leaf is in the child subtree 298 return child.getLeaf(leafIndex-count, excludedNode); 299 } else { 300 // go to the next child 301 count = count+numOfLeaves; 302 } 303 } else { // it is the excluededNode 304 // skip it and set the excludedNode to be null 305 excludedNode = null; 306 } 307 } 308 return null; 309 } 310 } 311 312 int getNumOfLeaves() { 313 return numOfLeaves; 314 } 315 } // end of InnerNode 316 317 /** 318 * the root cluster map 319 */ 320 InnerNode clusterMap = new InnerNode(InnerNode.ROOT); 321 /** Depth of all leaf nodes */ 322 private int depthOfAllLeaves = -1; 323 /** rack counter */ 324 private int numOfRacks = 0; 325 /** the lock used to manage access */ 326 private ReadWriteLock netlock; 327 328 public NetworkTopology() { 329 netlock = new ReentrantReadWriteLock(); 330 } 331 332 /** Add a leaf node 333 * Update node counter & rack counter if necessary 334 * @param node node to be added; can be null 335 * @exception IllegalArgumentException if add a node to a leave 336 or node to be added is not a leaf 337 */ 338 public void add(Node node) { 339 if (node==null) return; 340 String oldTopoStr = this.toString(); 341 if( node instanceof InnerNode ) { 342 throw new IllegalArgumentException( 343 "Not allow to add an inner node: "+NodeBase.getPath(node)); 344 } 345 netlock.writeLock().lock(); 346 try { 347 Node rack = getNode(node.getNetworkLocation()); 348 if (rack != null && !(rack instanceof InnerNode)) { 349 throw new IllegalArgumentException("Unexpected data node " 350 + node.toString() 351 + " at an illegal network location"); 352 } 353 if (clusterMap.add(node)) { 354 LOG.info("Adding a new node: "+NodeBase.getPath(node)); 355 if (rack == null) { 356 numOfRacks++; 357 } 358 if (!(node instanceof InnerNode)) { 359 if (depthOfAllLeaves == -1) { 360 depthOfAllLeaves = node.getLevel(); 361 } else { 362 if (depthOfAllLeaves != node.getLevel()) { 363 LOG.error("Error: can't add leaf node at depth " + 364 node.getLevel() + " to topology:\n" + oldTopoStr); 365 throw new InvalidTopologyException("Invalid network topology. " + 366 "You cannot have a rack and a non-rack node at the same " + 367 "level of the network topology."); 368 } 369 } 370 } 371 } 372 if(LOG.isDebugEnabled()) { 373 LOG.debug("NetworkTopology became:\n" + this.toString()); 374 } 375 } finally { 376 netlock.writeLock().unlock(); 377 } 378 } 379 380 /** Remove a node 381 * Update node counter and rack counter if necessary 382 * @param node node to be removed; can be null 383 */ 384 public void remove(Node node) { 385 if (node==null) return; 386 if( node instanceof InnerNode ) { 387 throw new IllegalArgumentException( 388 "Not allow to remove an inner node: "+NodeBase.getPath(node)); 389 } 390 LOG.info("Removing a node: "+NodeBase.getPath(node)); 391 netlock.writeLock().lock(); 392 try { 393 if (clusterMap.remove(node)) { 394 InnerNode rack = (InnerNode)getNode(node.getNetworkLocation()); 395 if (rack == null) { 396 numOfRacks--; 397 } 398 } 399 if(LOG.isDebugEnabled()) { 400 LOG.debug("NetworkTopology became:\n" + this.toString()); 401 } 402 } finally { 403 netlock.writeLock().unlock(); 404 } 405 } 406 407 /** Check if the tree contains node <i>node</i> 408 * 409 * @param node a node 410 * @return true if <i>node</i> is already in the tree; false otherwise 411 */ 412 public boolean contains(Node node) { 413 if (node == null) return false; 414 netlock.readLock().lock(); 415 try { 416 Node parent = node.getParent(); 417 for (int level = node.getLevel(); parent != null && level > 0; 418 parent = parent.getParent(), level--) { 419 if (parent == clusterMap) { 420 return true; 421 } 422 } 423 } finally { 424 netlock.readLock().unlock(); 425 } 426 return false; 427 } 428 429 /** Given a string representation of a node, return its reference 430 * 431 * @param loc 432 * a path-like string representation of a node 433 * @return a reference to the node; null if the node is not in the tree 434 */ 435 public Node getNode(String loc) { 436 netlock.readLock().lock(); 437 try { 438 loc = NodeBase.normalize(loc); 439 if (!NodeBase.ROOT.equals(loc)) 440 loc = loc.substring(1); 441 return clusterMap.getLoc(loc); 442 } finally { 443 netlock.readLock().unlock(); 444 } 445 } 446 447 /** @return the total number of racks */ 448 public int getNumOfRacks() { 449 netlock.readLock().lock(); 450 try { 451 return numOfRacks; 452 } finally { 453 netlock.readLock().unlock(); 454 } 455 } 456 457 /** @return the total number of leaf nodes */ 458 public int getNumOfLeaves() { 459 netlock.readLock().lock(); 460 try { 461 return clusterMap.getNumOfLeaves(); 462 } finally { 463 netlock.readLock().unlock(); 464 } 465 } 466 467 /** Return the distance between two nodes 468 * It is assumed that the distance from one node to its parent is 1 469 * The distance between two nodes is calculated by summing up their distances 470 * to their closest common ancestor. 471 * @param node1 one node 472 * @param node2 another node 473 * @return the distance between node1 and node2 which is zero if they are the same 474 * or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster 475 */ 476 public int getDistance(Node node1, Node node2) { 477 if (node1 == node2) { 478 return 0; 479 } 480 Node n1=node1, n2=node2; 481 int dis = 0; 482 netlock.readLock().lock(); 483 try { 484 int level1=node1.getLevel(), level2=node2.getLevel(); 485 while(n1!=null && level1>level2) { 486 n1 = n1.getParent(); 487 level1--; 488 dis++; 489 } 490 while(n2!=null && level2>level1) { 491 n2 = n2.getParent(); 492 level2--; 493 dis++; 494 } 495 while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) { 496 n1=n1.getParent(); 497 n2=n2.getParent(); 498 dis+=2; 499 } 500 } finally { 501 netlock.readLock().unlock(); 502 } 503 if (n1==null) { 504 LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1)); 505 return Integer.MAX_VALUE; 506 } 507 if (n2==null) { 508 LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2)); 509 return Integer.MAX_VALUE; 510 } 511 return dis+2; 512 } 513 514 /** Check if two nodes are on the same rack 515 * @param node1 one node (can be null) 516 * @param node2 another node (can be null) 517 * @return true if node1 and node2 are on the same rack; false otherwise 518 * @exception IllegalArgumentException when either node1 or node2 is null, or 519 * node1 or node2 do not belong to the cluster 520 */ 521 public boolean isOnSameRack( Node node1, Node node2) { 522 if (node1 == null || node2 == null) { 523 return false; 524 } 525 526 netlock.readLock().lock(); 527 try { 528 return node1.getParent()==node2.getParent(); 529 } finally { 530 netlock.readLock().unlock(); 531 } 532 } 533 534 final private static Random r = new Random(); 535 /** randomly choose one node from <i>scope</i> 536 * if scope starts with ~, choose one from the all nodes except for the 537 * ones in <i>scope</i>; otherwise, choose one from <i>scope</i> 538 * @param scope range of nodes from which a node will be chosen 539 * @return the chosen node 540 */ 541 public Node chooseRandom(String scope) { 542 netlock.readLock().lock(); 543 try { 544 if (scope.startsWith("~")) { 545 return chooseRandom(NodeBase.ROOT, scope.substring(1)); 546 } else { 547 return chooseRandom(scope, null); 548 } 549 } finally { 550 netlock.readLock().unlock(); 551 } 552 } 553 554 private Node chooseRandom(String scope, String excludedScope){ 555 if (excludedScope != null) { 556 if (scope.startsWith(excludedScope)) { 557 return null; 558 } 559 if (!excludedScope.startsWith(scope)) { 560 excludedScope = null; 561 } 562 } 563 Node node = getNode(scope); 564 if (!(node instanceof InnerNode)) { 565 return node; 566 } 567 InnerNode innerNode = (InnerNode)node; 568 int numOfDatanodes = innerNode.getNumOfLeaves(); 569 if (excludedScope == null) { 570 node = null; 571 } else { 572 node = getNode(excludedScope); 573 if (!(node instanceof InnerNode)) { 574 numOfDatanodes -= 1; 575 } else { 576 numOfDatanodes -= ((InnerNode)node).getNumOfLeaves(); 577 } 578 } 579 int leaveIndex = r.nextInt(numOfDatanodes); 580 return innerNode.getLeaf(leaveIndex, node); 581 } 582 583 /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i> 584 * if scope starts with ~, return the number of nodes that are not 585 * in <i>scope</i> and <i>excludedNodes</i>; 586 * @param scope a path string that may start with ~ 587 * @param excludedNodes a list of nodes 588 * @return number of available nodes 589 */ 590 public int countNumOfAvailableNodes(String scope, 591 Collection<Node> excludedNodes) { 592 boolean isExcluded=false; 593 if (scope.startsWith("~")) { 594 isExcluded=true; 595 scope=scope.substring(1); 596 } 597 scope = NodeBase.normalize(scope); 598 int count=0; // the number of nodes in both scope & excludedNodes 599 netlock.readLock().lock(); 600 try { 601 for(Node node:excludedNodes) { 602 if ((NodeBase.getPath(node)+NodeBase.PATH_SEPARATOR_STR). 603 startsWith(scope+NodeBase.PATH_SEPARATOR_STR)) { 604 count++; 605 } 606 } 607 Node n=getNode(scope); 608 int scopeNodeCount=1; 609 if (n instanceof InnerNode) { 610 scopeNodeCount=((InnerNode)n).getNumOfLeaves(); 611 } 612 if (isExcluded) { 613 return clusterMap.getNumOfLeaves()- 614 scopeNodeCount-excludedNodes.size()+count; 615 } else { 616 return scopeNodeCount-count; 617 } 618 } finally { 619 netlock.readLock().unlock(); 620 } 621 } 622 623 /** convert a network tree to a string */ 624 @Override 625 public String toString() { 626 // print the number of racks 627 StringBuilder tree = new StringBuilder(); 628 tree.append("Number of racks: "); 629 tree.append(numOfRacks); 630 tree.append("\n"); 631 // print the number of leaves 632 int numOfLeaves = getNumOfLeaves(); 633 tree.append("Expected number of leaves:"); 634 tree.append(numOfLeaves); 635 tree.append("\n"); 636 // print nodes 637 for(int i=0; i<numOfLeaves; i++) { 638 tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null))); 639 tree.append("\n"); 640 } 641 return tree.toString(); 642 } 643 644 /* swap two array items */ 645 static private void swap(Node[] nodes, int i, int j) { 646 Node tempNode; 647 tempNode = nodes[j]; 648 nodes[j] = nodes[i]; 649 nodes[i] = tempNode; 650 651 } 652 653 /** Sort nodes array by their distances to <i>reader</i> 654 * It linearly scans the array, if a local node is found, swap it with 655 * the first element of the array. 656 * If a local rack node is found, swap it with the first element following 657 * the local node. 658 * If neither local node or local rack node is found, put a random replica 659 * location at position 0. 660 * It leaves the rest nodes untouched. 661 * @param reader the node that wishes to read a block from one of the nodes 662 * @param nodes the list of nodes containing data for the reader 663 */ 664 public void pseudoSortByDistance( Node reader, Node[] nodes ) { 665 int tempIndex = 0; 666 int localRackNode = -1; 667 if (reader != null ) { 668 //scan the array to find the local node & local rack node 669 for(int i=0; i<nodes.length; i++) { 670 if(tempIndex == 0 && reader == nodes[i]) { //local node 671 //swap the local node and the node at position 0 672 if( i != 0 ) { 673 swap(nodes, tempIndex, i); 674 } 675 tempIndex=1; 676 if(localRackNode != -1 ) { 677 if(localRackNode == 0) { 678 localRackNode = i; 679 } 680 break; 681 } 682 } else if(localRackNode == -1 && isOnSameRack(reader, nodes[i])) { 683 //local rack 684 localRackNode = i; 685 if(tempIndex != 0 ) break; 686 } 687 } 688 689 // swap the local rack node and the node at position tempIndex 690 if(localRackNode != -1 && localRackNode != tempIndex ) { 691 swap(nodes, tempIndex, localRackNode); 692 tempIndex++; 693 } 694 } 695 696 // put a random node at position 0 if it is not a local/local-rack node 697 if(tempIndex == 0 && localRackNode == -1 && nodes.length != 0) { 698 swap(nodes, 0, r.nextInt(nodes.length)); 699 } 700 } 701}