(Since version 1.0.0) use mapPartitionsWithIndex and filter
(Since version 1.0.0) use mapPartitionsWithIndex and flatMap
(Since version 1.0.0) use mapPartitionsWithIndex and foreach
(Since version 1.2.0) use TaskContext.get
(Since version 0.7.0) use mapPartitionsWithIndex
(Since version 1.0.0) use mapPartitionsWithIndex
(Since version 1.0.0) use collect
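The deprecation notes above all point to the same migration path: thread partition information through mapPartitionsWithIndex (or read it from TaskContext.get) instead of the removed helper variants, and use collect in place of the old array-returning method. A minimal sketch of these replacements, assuming the standard RDD API (the data and closures here are illustrative, not from the original docs):

```scala
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("migration-sketch").master("local[*]").getOrCreate()
val rdd = spark.sparkContext.parallelize(1 to 100, numSlices = 4)

// mapPartitionsWithIndex + filter replaces the deprecated filter-with-index helper.
val filtered = rdd.mapPartitionsWithIndex { (index, iter) =>
  iter.filter(x => x % (index + 1) == 0)
}

// mapPartitionsWithIndex + flatMap follows the same pattern.
val flatMapped = rdd.mapPartitionsWithIndex { (index, iter) =>
  iter.flatMap(x => Seq.fill(index + 1)(x))
}

// TaskContext.get (available since 1.2.0) exposes the running task's context directly.
val tagged = rdd.mapPartitions { iter =>
  val pid = TaskContext.get().partitionId()
  iter.map(x => (pid, x))
}

// collect replaces the deprecated array-returning method.
val asArray: Array[Int] = filtered.collect()
```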
This is a specialized version of org.apache.spark.rdd.ShuffledRDD that is optimized for shuffling rows instead of Java key-value pairs. Note that something like this should eventually be implemented in Spark core, but that is blocked by some more general refactorings to shuffle interfaces / internals.
This RDD takes a ShuffleDependency (dependency) and an optional array of partition start indices (specifiedPartitionStartIndices) as input arguments.

The dependency has the parent RDD of this RDD, which represents the dataset before the shuffle (i.e. the map output). Elements of this RDD are (partitionId, Row) pairs. Partition ids should be in the range [0, numPartitions - 1]. dependency.partitioner is the original partitioner used to partition the map output, and dependency.partitioner.numPartitions is the number of pre-shuffle partitions (i.e. the number of partitions of the map output).
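Because each element's key is already a partition id, the dependency's partitioner can simply pass the key through. A minimal sketch of such a partitioner (the class name is hypothetical; Spark's actual internal implementation may differ):

```scala
import org.apache.spark.Partitioner

// Hypothetical pass-through partitioner: each element's key is its
// pre-computed partition id, so getPartition just returns the key.
class PassThroughPartitioner(override val numPartitions: Int) extends Partitioner {
  override def getPartition(key: Any): Int = {
    val pid = key.asInstanceOf[Int]
    require(pid >= 0 && pid < numPartitions, s"partition id $pid out of range")
    pid
  }
}
```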
When specifiedPartitionStartIndices is defined, specifiedPartitionStartIndices.length will be the number of post-shuffle partitions. In this case, the i-th post-shuffle partition includes pre-shuffle partitions specifiedPartitionStartIndices[i] to specifiedPartitionStartIndices[i+1] - 1 (inclusive); the last post-shuffle partition extends through the last pre-shuffle partition.
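For example, with 5 pre-shuffle partitions and start indices [0, 2, 4], there are 3 post-shuffle partitions covering pre-shuffle partitions [0, 1], [2, 3], and [4, 4]. A small sketch of this mapping (illustrative only, not Spark's internal code):

```scala
// How start indices define post-shuffle partition ranges over pre-shuffle partitions.
val numPreShufflePartitions = 5
val startIndices = Array(0, 2, 4) // => 3 post-shuffle partitions

val ranges = startIndices.indices.map { i =>
  val start = startIndices(i)
  // The last post-shuffle partition runs through the final pre-shuffle partition.
  val end =
    if (i + 1 < startIndices.length) startIndices(i + 1) - 1
    else numPreShufflePartitions - 1
  (i, start, end)
}

ranges.foreach { case (i, start, end) =>
  println(s"post-shuffle partition $i <- pre-shuffle partitions [$start, $end]")
}
// post-shuffle partition 0 <- pre-shuffle partitions [0, 1]
// post-shuffle partition 1 <- pre-shuffle partitions [2, 3]
// post-shuffle partition 2 <- pre-shuffle partitions [4, 4]
```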
When specifiedPartitionStartIndices is not defined, there will be dependency.partitioner.numPartitions post-shuffle partitions. In this case, a post-shuffle partition is created for every pre-shuffle partition.
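Put differently, the undefined case behaves as if the start indices were the identity sequence. A one-line sketch of that equivalence (variable names are illustrative):

```scala
// Equivalent identity mapping: post-shuffle partition i covers exactly
// pre-shuffle partition i.
val numPreShufflePartitions = 5 // example value
val defaultStartIndices: Array[Int] = (0 until numPreShufflePartitions).toArray
// Array(0, 1, 2, 3, 4)
```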