SplitOperators

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def chronologicalSplit(data: DataFrame, seed: Long, trainSplitChronologicalColumn: String, trainSplitChronologicalRandomPercentage: Double, trainPortion: Double): Array[DataFrame]
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
def genTestTrain(data: DataFrame, seed: Long, uniqueLabels: Array[Row], trainSplitMethod: String, labelCol: String, trainPortion: Double, syntheticCol: String = "syntheticColumn", trainSplitChronologicalColumn: String = "datetime", trainSplitChronologicalRandomPercentage: Double = 0.05, reductionFactor: Double = 0.5): Array[DataFrame]
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
def hashCode(): Int

Definition Classes
AnyRef → Any
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
def kSamplingSplit(data: DataFrame, seed: Long, uniqueLabels: Array[Row], syntheticCol: String, labelCol: String, trainPortion: Double): Array[DataFrame]

Split methodology for getting test and train of KSample up-sampled data.
Both data sets are split into test and train.
Split methodology for obtaining the train and test sets from KSample up-sampled data.
Both the real and the synthetic data sets are split into train and test portions.
The returned train set is the union of the real train and synthetic train data, while the returned test set contains only real test data.
data
DataFrame: The full data set (containing a synthetic column that indicates whether the data is real or not)
seed
Long: A seed value that is consistent across both data sets
uniqueLabels
Array[Row]: The unique entries of the label values
returns
Array[DataFrame] of Array(trainData, testData)

Since
0.5.1
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
def optimizeTestTrain(train: DataFrame, test: DataFrame, optimalParts: Int, shuffle: Boolean = false): (DataFrame, DataFrame)
def overSampleSplit(data: DataFrame, seed: Long, labelCol: String, trainPortion: Double): Array[DataFrame]
lazy val sc: SparkContext

Definition Classes
SparkSessionWrapper
lazy val spark: SparkSession

Definition Classes
SparkSessionWrapper
def stratifiedSplit(data: DataFrame, seed: Long, uniqueLabels: Array[Row], labelCol: String, trainPortion: Double): Array[DataFrame]

Method for stratification of the test/train around the unique values of the label column. This mode is recommended for label value distributions in classification that have relatively balanced and uniformly distributed instances of the classes.
Method for stratification of the test/train around the unique values of the label column. This mode is recommended for label value distributions in classification that have relatively balanced and uniformly distributed instances of the classes. If there is significant skew, it is highly recommended to use under or over sampling.
data
DataFrame that is the input to the train/test split
seed
random seed for splitting the data into train/test.
returns
An Array of DataFrames: Array[<trainData>, <testData>]
def stratifyReduce(data: DataFrame, reductionFactor: Double, seed: Long, uniqueLabels: Array[Row], labelCol: String, trainPortion: Double): Array[DataFrame]
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
def underSampleSplit(data: DataFrame, seed: Long, labelCol: String, trainPortion: Double): Array[DataFrame]
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )

Related Doc: package split

object SplitOperators extends SparkSessionWrapper

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

final def asInstanceOf[T0]: T0

def chronologicalSplit(data: DataFrame, seed: Long, trainSplitChronologicalColumn: String, trainSplitChronologicalRandomPercentage: Double, trainPortion: Double): Array[DataFrame]

def clone(): AnyRef

final def eq(arg0: AnyRef): Boolean

def equals(arg0: Any): Boolean

def finalize(): Unit

final def getClass(): Class[_]

def hashCode(): Int

final def isInstanceOf[T0]: Boolean

def kSamplingSplit(data: DataFrame, seed: Long, uniqueLabels: Array[Row], syntheticCol: String, labelCol: String, trainPortion: Double): Array[DataFrame]

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

def optimizeTestTrain(train: DataFrame, test: DataFrame, optimalParts: Int, shuffle: Boolean = false): (DataFrame, DataFrame)

def overSampleSplit(data: DataFrame, seed: Long, labelCol: String, trainPortion: Double): Array[DataFrame]

lazy val sc: SparkContext

lazy val spark: SparkSession

def stratifiedSplit(data: DataFrame, seed: Long, uniqueLabels: Array[Row], labelCol: String, trainPortion: Double): Array[DataFrame]

def stratifyReduce(data: DataFrame, reductionFactor: Double, seed: Long, uniqueLabels: Array[Row], labelCol: String, trainPortion: Double): Array[DataFrame]

final def synchronized[T0](arg0: ⇒ T0): T0

def toString(): String

def underSampleSplit(data: DataFrame, seed: Long, labelCol: String, trainPortion: Double): Array[DataFrame]

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

Inherited from SparkSessionWrapper

Inherited from Serializable

Inherited from Serializable

Inherited from AnyRef

Inherited from Any

Ungrouped