class DataFrameOptimizer extends Transformer with DefaultParamsWritable
Optimizes a org.apache.spark.sql.DataFrame by adjusting the number of partitions, optionally caching it, and optionally persisting it to disk.
This transformer is useful when fine-tuning performance in a Spark NLP pipeline or preparing data for export or downstream tasks. It provides options to control the number of partitions directly, or to calculate them using the number of executor cores and workers. Additionally, it can persist the DataFrame in CSV, JSON, or Parquet format with configurable writer options.
Parameters
executorCores(Int): Number of cores per executor, used to calculate partitions.numWorkers(Int): Total number of executor nodes.numPartitions(Int): Target number of partitions. Overrides the computed value from cores × workers.doCache(Boolean): Whether to cache the transformed DataFrame.persistPath(String): Optional path to write the output DataFrame.persistFormat(String): File format for persistence. Supported:csv,json,parquet.outputOptions(Map[String, String]): Extra options passed to the DataFrameWriter.
Example
val optimizer = new DataFrameOptimizer() .setExecutorCores(4) .setNumWorkers(5) .setDoCache(true) .setPersistPath("/tmp/output") .setPersistFormat("parquet") .setOutputOptions(Map("compression" -> "snappy")) val optimizedDF = optimizer.transform(inputDF)
This transformer does not modify the schema of the DataFrame.
Linear Supertypes
Ordering
- Alphabetic
- By Inheritance
Inherited
- DataFrameOptimizer
- DefaultParamsWritable
- MLWritable
- Transformer
- PipelineStage
- Logging
- Params
- Serializable
- Identifiable
- AnyRef
- Any
- Hide All
- Show All
Visibility
- Public
- Protected
Value Members
- final def !=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- final def ##: Int
- Definition Classes
- AnyRef → Any
- final def $[T](param: Param[T]): T
- Attributes
- protected
- Definition Classes
- Params
- final def ==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- final def asInstanceOf[T0]: T0
- Definition Classes
- Any
- final def clear(param: Param[_]): DataFrameOptimizer.this.type
- Definition Classes
- Params
- def clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.CloneNotSupportedException]) @HotSpotIntrinsicCandidate() @native()
- def copy(extra: ParamMap): Transformer
- Definition Classes
- DataFrameOptimizer → Transformer → PipelineStage → Params
- def copyValues[T <: Params](to: T, extra: ParamMap): T
- Attributes
- protected
- Definition Classes
- Params
- final def defaultCopy[T <: Params](extra: ParamMap): T
- Attributes
- protected
- Definition Classes
- Params
- val doCache: BooleanParam
- final def eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- def equals(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef → Any
- val executorCores: IntParam
- def explainParam(param: Param[_]): String
- Definition Classes
- Params
- def explainParams(): String
- Definition Classes
- Params
- final def extractParamMap(): ParamMap
- Definition Classes
- Params
- final def extractParamMap(extra: ParamMap): ParamMap
- Definition Classes
- Params
- final def get[T](param: Param[T]): Option[T]
- Definition Classes
- Params
- final def getClass(): Class[_ <: AnyRef]
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def getDefault[T](param: Param[T]): Option[T]
- Definition Classes
- Params
- final def getOrDefault[T](param: Param[T]): T
- Definition Classes
- Params
- def getParam(paramName: String): Param[Any]
- Definition Classes
- Params
- final def hasDefault[T](param: Param[T]): Boolean
- Definition Classes
- Params
- def hasParam(paramName: String): Boolean
- Definition Classes
- Params
- def hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
- Attributes
- protected
- Definition Classes
- Logging
- def initializeLogIfNecessary(isInterpreter: Boolean): Unit
- Attributes
- protected
- Definition Classes
- Logging
- final def isDefined(param: Param[_]): Boolean
- Definition Classes
- Params
- final def isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- final def isSet(param: Param[_]): Boolean
- Definition Classes
- Params
- def isTraceEnabled(): Boolean
- Attributes
- protected
- Definition Classes
- Logging
- def log: Logger
- Attributes
- protected
- Definition Classes
- Logging
- def logDebug(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logDebug(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logError(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logError(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logInfo(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logInfo(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logName: String
- Attributes
- protected
- Definition Classes
- Logging
- def logTrace(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logTrace(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logWarning(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logWarning(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- final def ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- final def notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- val numPartitions: IntParam
- val numWorkers: IntParam
- val outputOptions: Param[Map[String, String]]
- lazy val params: Array[Param[_]]
- Definition Classes
- Params
- val persistFormat: Param[String]
- val persistPath: Param[String]
- def save(path: String): Unit
- Definition Classes
- MLWritable
- Annotations
- @throws("If the input path already exists but overwrite is not enabled.") @Since("1.6.0")
- final def set(paramPair: ParamPair[_]): DataFrameOptimizer.this.type
- Attributes
- protected
- Definition Classes
- Params
- final def set(param: String, value: Any): DataFrameOptimizer.this.type
- Attributes
- protected
- Definition Classes
- Params
- final def set[T](param: Param[T], value: T): DataFrameOptimizer.this.type
- Definition Classes
- Params
- final def setDefault(paramPairs: ParamPair[_]*): DataFrameOptimizer.this.type
- Attributes
- protected
- Definition Classes
- Params
- final def setDefault[T](param: Param[T], value: T): DataFrameOptimizer.this.type
- Attributes
- protected[org.apache.spark.ml]
- Definition Classes
- Params
- def setDoCache(value: Boolean): DataFrameOptimizer.this.type
- def setExecutorCores(value: Int): DataFrameOptimizer.this.type
- def setNumPartitions(value: Int): DataFrameOptimizer.this.type
- def setNumWorkers(value: Int): DataFrameOptimizer.this.type
- def setOutputOptions(options: Map[String, String]): DataFrameOptimizer.this.type
- def setPersistFormat(value: String): DataFrameOptimizer.this.type
- def setPersistPath(value: String): DataFrameOptimizer.this.type
- final def synchronized[T0](arg0: => T0): T0
- Definition Classes
- AnyRef
- def toString(): String
- Definition Classes
- Identifiable → AnyRef → Any
- def transform(dataset: Dataset[_]): DataFrame
- Definition Classes
- DataFrameOptimizer → Transformer
- def transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame
- Definition Classes
- Transformer
- Annotations
- @Since("2.0.0")
- def transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame
- Definition Classes
- Transformer
- Annotations
- @varargs() @Since("2.0.0")
- def transformSchema(schema: StructType): StructType
- Definition Classes
- DataFrameOptimizer → PipelineStage
- def transformSchema(schema: StructType, logging: Boolean): StructType
- Attributes
- protected
- Definition Classes
- PipelineStage
- Annotations
- @DeveloperApi()
- val uid: String
- Definition Classes
- DataFrameOptimizer → Identifiable
- final def wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- final def wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException]) @native()
- final def wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- def write: MLWriter
- Definition Classes
- DefaultParamsWritable → MLWritable
Deprecated Value Members
- def finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.Throwable]) @Deprecated
- Deprecated
(Since version 9)