org.apache.spark.ml.automl.feature
Main fit method that will build a BinaryEncoder model from the dataset and the configured input and output columns specified in the setters. The primary principle at work here is dimensionality reduction for the encoding of extremely high-cardinality StringIndexed columns. OneHotEncoding works extremely well for this purpose, but its side effect of generating an extremely large number of columns leads to increased memory pressure. This package allows for a lossy reduction of this space by distilling the information into a binary string encoding whose length is determined by the binary representation of the maximum nominal value.
The dataset (or DataFrame) used in training the model
BinaryEncoderModel - a serializable artifact that has the output schema and encoding embedded within it.
e.g. if the cardinality of a nominal column is 113, the binary representation of that is 1110001. When using OHE, this would result in 113 (or 114 if allowing invalids) binary positions within a sparse vector, creating 113 or 114 columns in the dataset. However, using BinaryEncoder, we are left with 7 (or 8, if allowing invalids) dense vector positions to capture the same amount of information.
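The arithmetic in the example above can be sketched in a few lines of plain Scala. This is an illustration of how the encoding width could be derived, not the package's actual implementation; the object and method names are hypothetical.

```scala
// Hypothetical sketch: the number of dense-vector positions needed by the
// BinaryEncoder is the number of binary digits required to represent the
// maximum nominal index (plus one when invalids are allowed).
object EncodingWidthSketch {
  // e.g. 113 -> "1110001" -> 7 positions
  def width(cardinality: Int): Int =
    java.lang.Integer.toBinaryString(cardinality).length

  // With invalid handling enabled, one extra position is reserved.
  def widthWithInvalid(cardinality: Int): Int = width(cardinality) + 1
}
```

Compare this with OHE, where the same cardinality of 113 would produce 113 (or 114) sparse-vector positions.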
0.5.3
Due to the nature of this encoding and how the majority of models learn, this is an information-lossy encoding. However, considering that high-cardinality non-numeric nominal fields are frequently discarded due to the resulting explosion in dataset size, this provides the ability to utilize high-cardinality fields that otherwise could not be included.
Configuration of the Parameter for handling invalid entries in a previously modeled feature column.
Setter for supplying an optional 'keep' or 'error' (Default: 'error') for unseen values that arrive in a pre-trained model. With the 'keep' setting, an additional vector position is added to the output column to ensure that no collisions can exist with real data, and the values throughout each of the Array[Double] positions in the DenseVector output will all be set to '1'.
String: either 'keep' or 'error' (Default: 'error')
0.5.3
SparkException
if the configuration value supplied is not either 'keep' or 'error'
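The 'keep' behavior described above can be sketched in self-contained Scala: a seen index is written as its binary digits across the fixed vector width, while an unseen value maps to the reserved all-ones pattern. The names here are illustrative assumptions, not the package's actual API, and the real implementation operates on Spark DenseVector columns rather than bare arrays.

```scala
// Hypothetical sketch of the 'keep' handling for unseen values:
// a known index becomes its left-padded binary digits; an unseen
// value (None) becomes the all-ones pattern in the extra position space.
object KeepEncodingSketch {
  def encode(index: Option[Int], positions: Int): Array[Double] = index match {
    case Some(i) =>
      // Left-pad the binary string of the index to the fixed vector width.
      val bits = Integer.toBinaryString(i).reverse.padTo(positions, '0').reverse
      bits.map(c => (c - '0').toDouble).toArray
    case None =>
      // Unseen value under 'keep': every position set to 1.0.
      Array.fill(positions)(1.0)
  }
}
```

Under the 'error' setting, the `None` branch would instead throw a SparkException rather than emit the all-ones pattern.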
Setter for supplying the array of input columns to be encoded with the BinaryEncoder type.
Array of column names
0.5.3
Setter for supplying the array of output columns that are the result of running a .transform from a trained model on an appropriate dataset of compatible schema.
Array of column names that will be generated through a .transform
0.5.3
Method for validating the resultant schema from the application of building and transforming using this encoder package. The purpose of validation is to ensure that the supplied input columns are of the correct binary or nominal (ordinal numeric) type and that the output columns will contain the correct number of columns based on the configuration set.
The schema of the dataset supplied for training of the model or used in transforming using the model
Boolean flag for whether to allow an additional binary encoding value for any values that were unknown at the time of model training; such values will be converted to a 'max binary value' of the encoding length + 1, i.e. all n positions set to '1'.
StructType that represents the transformed schema with additional output columns appended to the dataset structure.
0.5.3
UnsupportedOperationException
if the configured input cols and output cols do not match one another in length.