class
MergeJoin[K, V, W, O] extends Iterator[O]
Instance Constructors
-
new
MergeJoin(context: TaskContext, left: Iterator[(K, V)], right: Iterator[(K, W)], joiner: Joiner[K, V, W, O])(implicit ord: Ordering[K])
Value Members
-
-
final
def
!=(arg0: Any): Boolean
-
final
def
##(): Int
-
-
def
/:[B](z: B)(op: (B, O) ⇒ B): B
-
def
:\[B](z: B)(op: (O, B) ⇒ B): B
-
-
final
def
==(arg0: Any): Boolean
-
def
addString(b: StringBuilder): StringBuilder
-
def
addString(b: StringBuilder, sep: String): StringBuilder
-
def
addString(b: StringBuilder, start: String, sep: String, end: String): StringBuilder
-
def
aggregate[B](z: B)(seqop: (B, O) ⇒ B, combop: (B, B) ⇒ B): B
-
final
def
asInstanceOf[T0]: T0
-
-
def
clone(): AnyRef
-
-
-
def
contains(elem: Any): Boolean
-
def
copyToArray[B >: O](xs: Array[B], start: Int, len: Int): Unit
-
def
copyToArray[B >: O](xs: Array[B]): Unit
-
def
copyToArray[B >: O](xs: Array[B], start: Int): Unit
-
def
copyToBuffer[B >: O](dest: Buffer[B]): Unit
-
-
def
count(p: (O) ⇒ Boolean): Int
-
var
currentIterator: Iterator[O]
-
-
-
-
-
-
-
-
-
def
finalize(): Unit
-
-
def
finish: Iterator[O]
-
-
-
def
fold[A1 >: O](z: A1)(op: (A1, A1) ⇒ A1): A1
-
def
foldLeft[B](z: B)(op: (B, O) ⇒ B): B
-
def
foldRight[B](z: B)(op: (O, B) ⇒ B): B
-
-
def
foreach[U](f: (O) ⇒ U): Unit
-
final
def
getClass(): Class[_]
-
-
def
hasDefiniteSize: Boolean
-
-
def
hashCode(): Int
-
def
indexOf[B >: O](elem: B): Int
-
def
indexWhere(p: (O) ⇒ Boolean): Int
-
-
final
def
isInstanceOf[T0]: Boolean
-
def
isTraversableAgain: Boolean
-
-
def
length: Int
-
def
map[B](f: (O) ⇒ B): Iterator[B]
-
def
max[B >: O](implicit cmp: Ordering[B]): O
-
def
maxBy[B](f: (O) ⇒ B)(implicit cmp: Ordering[B]): O
-
def
min[B >: O](implicit cmp: Ordering[B]): O
-
def
minBy[B](f: (O) ⇒ B)(implicit cmp: Ordering[B]): O
-
def
mkString: String
-
def
mkString(sep: String): String
-
def
mkString(start: String, sep: String, end: String): String
-
-
def
next(): O
-
def
nextIterator(): Iterator[O]
-
-
final
def
notify(): Unit
-
final
def
notifyAll(): Unit
-
implicit
val
ord: Ordering[K]
-
def
padTo[A1 >: O](len: Int, elem: A1): Iterator[A1]
-
-
def
patch[B >: O](from: Int, patchElems: Iterator[B], replaced: Int): Iterator[B]
-
def
product[B >: O](implicit num: Numeric[B]): B
-
def
reduce[A1 >: O](op: (A1, A1) ⇒ A1): A1
-
def
reduceLeft[B >: O](op: (B, O) ⇒ B): B
-
def
reduceLeftOption[B >: O](op: (B, O) ⇒ B): Option[B]
-
def
reduceOption[A1 >: O](op: (A1, A1) ⇒ A1): Option[A1]
-
def
reduceRight[B >: O](op: (O, B) ⇒ B): B
-
def
reduceRightOption[B >: O](op: (O, B) ⇒ B): Option[B]
-
def
reversed: List[O]
-
-
-
def
scanLeft[B](z: B)(op: (B, O) ⇒ B): Iterator[B]
-
def
scanRight[B](z: B)(op: (O, B) ⇒ B): Iterator[B]
-
-
def
size: Int
-
def
slice(from: Int, until: Int): Iterator[O]
-
-
-
def
sum[B >: O](implicit num: Numeric[B]): B
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
-
-
def
takeLeftValuesForKey(key: K): Iterator[(K, V)]
-
def
takeRightValuesForKey(key: K): Iterator[(K, W)]
-
-
def
to[Col[_]](implicit cbf: CanBuildFrom[Nothing, O, Col[O]]): Col[O]
-
def
toArray[B >: O](implicit arg0: ClassTag[B]): Array[B]
-
def
toBuffer[B >: O]: Buffer[B]
-
def
toIndexedSeq: IndexedSeq[O]
-
def
toIterable: Iterable[O]
-
def
toIterator: Iterator[O]
-
def
toList: List[O]
-
def
toMap[T, U](implicit ev: <:<[O, (T, U)]): Map[T, U]
-
def
toSeq: Seq[O]
-
def
toSet[B >: O]: Set[B]
-
def
toStream: Stream[O]
-
def
toString(): String
-
-
def
toVector: Vector[O]
-
final
def
wait(): Unit
-
final
def
wait(arg0: Long, arg1: Int): Unit
-
final
def
wait(arg0: Long): Unit
-
-
-
def
zipAll[B, A1 >: O, B1 >: B](that: Iterator[B], thisElem: A1, thatElem: B1): Iterator[(A1, B1)]
-
def
zipWithIndex: Iterator[(O, Int)]
Deprecated Value Members
-
def
/:\[A1 >: O](z: A1)(op: (A1, A1) ⇒ A1): A1
:: DeveloperApi ::
Merge-join implementation that creates a spill-able collection for the right side, to be iterated over for each matching key on the left side. This enables joins that do not require all values for any given key to fit in memory, but it *will* try to buffer as many values as possible using Spark's built-in 'ExternalSorter'. Since 'ExternalSorter' is a private class, this class is packaged here.
There are numerous optimizations in place to try to minimize the work being done in the join:
1) Since the Joiner returns a value Iterator for each key, we don't need to invoke join logic on each value iteration; instead, we perform join logic once for each unique key. This also allows each 'Joiner' to optimize via the methods below based on the join being performed.
2) In cases where we have a key on one side but not the other, we skip creation of the spill-able collection and write the output tuples directly according to the Joiner's
leftOuter
/rightOuter
method.
3) In cases where there are no values to emit for a particular key, the Joiner can emit an empty Iterator, in which case we will immediately move to the next key without emitting+filtering tuples for those values.
4) In cases where we have a key on both sides, we invoke the Joiner's
inner
method. The default implementation will create a spill-able collection for the right side that will buffer as many values as possible in memory before spilling to disk... so we only pay the penalty for spilling to disk on keys where it is absolutely necessary.