class PdfToText extends Transformer with DefaultParamsWritable with HasInputValidator with HasInputCol with HasOutputCol with HasLocalProcess with PdfToTextTrait with HasPdfProperties
Extracts text from a PDF document, either as a single string or as one string per page. The input is a column containing the binary representation of the PDF document. The output contains columns with the extracted text and the page number. When page splitting is enabled, each page is exploded into a separate row.
It can be configured with the following properties:
- pageNumCol: Page number output column name.
- originCol: Input column name with original path of file.
- partitionNum: Number of partitions. By default, it is set to 0.
- storeSplittedPdf: Force storing the byte content of the split PDF. By default, it is set to false.
- splitPage: Enable/disable splitting per page to identify page numbers and improve performance. By default, it is set to true.
- onlyPageNum: Extract only page numbers. By default, it is set to false.
- textStripper: Text stripper type used for output layout and formatting.
- sort: Enable/disable sorting content on the page. By default, it is set to false.
Example
val pdfToText = new PdfToText()
  .setStoreSplittedPdf(true)
  .setSplitPage(true)

val filesDf = spark.read.format("binaryFile").load("Documents/files/pdf")

val pipelineModel = new Pipeline()
  .setStages(Array(pdfToText))
  .fit(filesDf)

val pdfDf = pipelineModel.transform(filesDf)

pdfDf.show()
+--------------------+--------------------+------+--------------------+
|                path|    modificationTime|length|                text|
+--------------------+--------------------+------+--------------------+
|file:/Users/paula...|2025-05-15 11:33:...| 25803|This is a Title \...|
|file:/Users/paula...|2025-05-15 11:33:...| 15629|                  \n|
|file:/Users/paula...|2025-05-15 11:33:...| 15629|                  \n|
|file:/Users/paula...|2025-05-15 11:33:...| 15629|                  \n|
|file:/Users/paula...|2025-05-15 11:33:...|  9487|   This is a page.\n|
|file:/Users/paula...|2025-05-15 11:33:...|  9487|This is another p...|
|file:/Users/paula...|2025-05-15 11:33:...|  9487| Yet another page.\n|
|file:/Users/paula...|2025-05-15 11:56:...|  1563|Hello, this is li...|
+--------------------+--------------------+------+--------------------+

pdfDf.printSchema()
root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- text: string (nullable = true)
 |-- height_dimension: integer (nullable = true)
 |-- width_dimension: integer (nullable = true)
 |-- content: binary (nullable = true)
 |-- exception: string (nullable = true)
 |-- pagenum: integer (nullable = true)
Linear Supertypes
Ordering
- Grouped
- Alphabetic
- By Inheritance
Inherited
- PdfToText
- HasPdfProperties
- ParamsAndFeaturesWritable
- HasFeatures
- PdfToTextTrait
- PdfUtils
- HasLocalProcess
- HasOutputCol
- HasInputCol
- HasInputValidator
- DefaultParamsWritable
- MLWritable
- Transformer
- PipelineStage
- Logging
- Params
- Serializable
- Serializable
- Identifiable
- AnyRef
- Any
- Hide All
- Show All
Visibility
- Public
- All
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
$[T](param: Param[T]): T
- Attributes
- protected
- Definition Classes
- Params
-
def
$$[T](feature: StructFeature[T]): T
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
$$[K, V](feature: MapFeature[K, V]): Map[K, V]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
$$[T](feature: SetFeature[T]): Set[T]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
$$[T](feature: ArrayFeature[T]): Array[T]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
val
MAX_CHARACTER_BEFORE_HEADER: Int
- Definition Classes
- PdfUtils
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
checkAndFixPdf(content: Array[Byte]): Array[Byte]
- Definition Classes
- PdfUtils
-
final
def
clear(param: Param[_]): PdfToText.this.type
- Definition Classes
- Params
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
compareDataTypes(dtype1: DataType, dtype2: DataType): Boolean
- Definition Classes
- HasInputValidator
-
def
copy(extra: ParamMap): Transformer
- Definition Classes
- PdfToText → Transformer → PipelineStage → Params
-
def
copyValues[T <: Params](to: T, extra: ParamMap): T
- Attributes
- protected
- Definition Classes
- Params
-
final
def
defaultCopy[T <: Params](extra: ParamMap): T
- Attributes
- protected
- Definition Classes
- Params
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
explainParam(param: Param[_]): String
- Definition Classes
- Params
-
def
explainParams(): String
- Definition Classes
- Params
-
final
def
extractParamMap(): ParamMap
- Definition Classes
- Params
-
final
def
extractParamMap(extra: ParamMap): ParamMap
- Definition Classes
- Params
-
val
features: ArrayBuffer[Feature[_, _, _]]
- Definition Classes
- HasFeatures
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
def
get[T](feature: StructFeature[T]): Option[T]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
get[T](feature: SetFeature[T]): Option[Set[T]]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
get[T](feature: ArrayFeature[T]): Option[Array[T]]
- Attributes
- protected
- Definition Classes
- HasFeatures
-
final
def
get[T](param: Param[T]): Option[T]
- Definition Classes
- Params
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
getDefault[T](param: Param[T]): Option[T]
- Definition Classes
- Params
-
final
def
getInputCol: String
- Definition Classes
- HasInputCol
-
final
def
getOrDefault[T](param: Param[T]): T
- Definition Classes
- Params
-
final
def
getOutputCol: String
- Definition Classes
- HasOutputCol
-
def
getParam(paramName: String): Param[Any]
- Definition Classes
- Params
-
final
def
hasDefault[T](param: Param[T]): Boolean
- Definition Classes
- Params
-
def
hasParam(paramName: String): Boolean
- Definition Classes
- Params
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
initializeLogIfNecessary(isInterpreter: Boolean): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
final
val
inputCol: Param[String]
- Definition Classes
- HasInputCol
-
final
def
isDefined(param: Param[_]): Boolean
- Definition Classes
- Params
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
isSet(param: Param[_]): Boolean
- Definition Classes
- Params
-
def
isTraceEnabled(): Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
localProcess(input: Array[Map[String, Seq[IAnnotation]]]): Array[Map[String, Seq[IAnnotation]]]
- Definition Classes
- PdfToText → HasLocalProcess
-
def
log: Logger
- Attributes
- protected
- Definition Classes
- Logging
-
def
logDebug(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logDebug(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logError(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logError(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logInfo(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logInfo(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logName: String
- Attributes
- protected
- Definition Classes
- Logging
-
def
logTrace(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logTrace(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logWarning(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logWarning(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
def
onWrite(path: String, spark: SparkSession): Unit
- Attributes
- protected
- Definition Classes
- ParamsAndFeaturesWritable
-
final
val
onlyPageNum: BooleanParam
- Definition Classes
- HasPdfProperties
-
final
val
originCol: Param[String]
- Definition Classes
- HasPdfProperties
-
final
val
outputCol: Param[String]
- Definition Classes
- HasOutputCol
-
def
outputDataType: StructType
- Attributes
- protected
-
final
val
pageNumCol: Param[String]
- Definition Classes
- HasPdfProperties
-
lazy val
params: Array[Param[_]]
- Definition Classes
- Params
-
final
val
partitionNum: IntParam
- Definition Classes
- HasPdfProperties
-
def
pdfToText(content: Array[Byte], onlyPageNum: Boolean, splitPage: Boolean, storeSplittedPdf: Boolean, sort: Boolean, textStripper: String): Seq[(String, Int, Int, Array[Byte], String, Int)]
- Definition Classes
- PdfToTextTrait
-
def
save(path: String): Unit
- Definition Classes
- MLWritable
- Annotations
- @Since( "1.6.0" ) @throws( ... )
-
def
set[T](feature: StructFeature[T], value: T): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
set[K, V](feature: MapFeature[K, V], value: Map[K, V]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
set[T](feature: SetFeature[T], value: Set[T]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
set[T](feature: ArrayFeature[T], value: Array[T]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
final
def
set(paramPair: ParamPair[_]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- Params
-
final
def
set(param: String, value: Any): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- Params
-
final
def
set[T](param: Param[T], value: T): PdfToText.this.type
- Definition Classes
- Params
-
def
setDefault[T](feature: StructFeature[T], value: () ⇒ T): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
setDefault[K, V](feature: MapFeature[K, V], value: () ⇒ Map[K, V]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
setDefault[T](feature: SetFeature[T], value: () ⇒ Set[T]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
def
setDefault[T](feature: ArrayFeature[T], value: () ⇒ Array[T]): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
-
final
def
setDefault(paramPairs: ParamPair[_]*): PdfToText.this.type
- Attributes
- protected
- Definition Classes
- Params
-
final
def
setDefault[T](param: Param[T], value: T): PdfToText.this.type
- Attributes
- protected[org.apache.spark.ml]
- Definition Classes
- Params
-
def
setInputCol(value: String): PdfToText.this.type
- value
Name of input annotation col
-
def
setOnlyPageNum(value: Boolean): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setOriginCol(value: String): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setOutputCol(value: String): PdfToText.this.type
- value
Name of extraction output col
-
def
setPageNumCol(value: String): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setPartitionNum(value: Int): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setSort(value: Boolean): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setSplitPage(value: Boolean): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setStoreSplittedPdf(value: Boolean): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
def
setTextStripper(value: String): PdfToText.this.type
- Definition Classes
- HasPdfProperties
-
final
val
sort: BooleanParam
- Definition Classes
- HasPdfProperties
-
final
val
splitPage: BooleanParam
- Definition Classes
- HasPdfProperties
-
final
val
storeSplittedPdf: BooleanParam
- Definition Classes
- HasPdfProperties
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
final
val
textStripper: Param[String]
- Definition Classes
- HasPdfProperties
-
def
toString(): String
- Definition Classes
- Identifiable → AnyRef → Any
-
def
transform(df: Dataset[_]): DataFrame
- Definition Classes
- PdfToText → Transformer
-
def
transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame
- Definition Classes
- Transformer
- Annotations
- @Since( "2.0.0" )
-
def
transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame
- Definition Classes
- Transformer
- Annotations
- @Since( "2.0.0" ) @varargs()
-
def
transformSchema(schema: StructType): StructType
- Definition Classes
- PdfToText → PipelineStage
-
def
transformSchema(schema: StructType, logging: Boolean): StructType
- Attributes
- protected
- Definition Classes
- PipelineStage
- Annotations
- @DeveloperApi()
-
val
uid: String
- Definition Classes
- PdfToText → HasInputValidator → Identifiable
-
def
validateInputCol(schema: StructType, colName: String, colType: DataType): Unit
- Definition Classes
- HasInputValidator
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
write: MLWriter
- Definition Classes
- ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable