class Reader2Doc extends Transformer with DefaultParamsWritable with HasOutputAnnotatorType with HasOutputAnnotationCol with HasBinaryReaderProperties with HasTextReaderProperties with HasReaderContent
The Reader2Doc annotator lets you read files directly within existing Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Doc can be used for extracting structured content from various document types using Spark NLP readers. It supports reading from many file types and returns the parsed output as a structured Spark DataFrame.
Supported formats include plain text, HTML, Word (.doc/.docx), Excel (.xls/.xlsx), PowerPoint (.ppt/.pptx), email files (.eml, .msg), and PDFs.
Example
import com.johnsnowlabs.reader.Reader2Doc
import com.johnsnowlabs.nlp.base.DocumentAssembler
import org.apache.spark.ml.Pipeline

val reader2Doc = new Reader2Doc()
  .setContentType("application/pdf")
  .setContentPath(s"$pdfDirectory/")
  .setExplodeDocs(true)

val pipeline = new Pipeline()
  .setStages(Array(reader2Doc))

val pipelineModel = pipeline.fit(emptyDataSet)
val resultDf = pipelineModel.transform(emptyDataSet)

resultDf.show()
+------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------+
|[{document, 0, 14, This is a Title, {pageNumber -> 1, elementType -> Title, fileName -> pdf-title.pdf}, []}]                        |
|[{document, 15, 38, This is a narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]      |
|[{document, 39, 68, This is another narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]|
+------------------------------------------------------------------------------------------------------------------------------------+
- Grouped
- Alphabetic
- By Inheritance
- Reader2Doc
- HasReaderContent
- HasTagsReaderProperties
- HasXmlReaderProperties
- HasHTMLReaderProperties
- HasReaderProperties
- HasTextReaderProperties
- HasBinaryReaderProperties
- HasPowerPointProperties
- HasPdfReaderProperties
- HasExcelReaderProperties
- HasEmailReaderProperties
- ParamsAndFeaturesWritable
- HasFeatures
- HasOutputAnnotationCol
- HasOutputAnnotatorType
- DefaultParamsWritable
- MLWritable
- Transformer
- PipelineStage
- Logging
- Params
- Serializable
- Identifiable
- AnyRef
- Any
- Hide All
- Show All
- Public
- Protected
Type Members
- type AnnotatorType = String
- Definition Classes
- HasOutputAnnotatorType
Value Members
- final def !=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- final def ##: Int
- Definition Classes
- AnyRef → Any
- final def $[T](param: Param[T]): T
- Attributes
- protected
- Definition Classes
- Params
- def $$[T](feature: StructFeature[T]): T
- Attributes
- protected
- Definition Classes
- HasFeatures
- def $$[K, V](feature: MapFeature[K, V]): Map[K, V]
- Attributes
- protected
- Definition Classes
- HasFeatures
- def $$[T](feature: SetFeature[T]): Set[T]
- Attributes
- protected
- Definition Classes
- HasFeatures
- def $$[T](feature: ArrayFeature[T]): Array[T]
- Attributes
- protected
- Definition Classes
- HasFeatures
- final def ==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val addAttachmentContent: Param[Boolean]
- Definition Classes
- HasEmailReaderProperties
- def afterAnnotate(dataset: DataFrame): DataFrame
- val appendCells: Param[Boolean]
- Definition Classes
- HasExcelReaderProperties
- final def asInstanceOf[T0]: T0
- Definition Classes
- Any
- def buildEmptyDataFrame(dataset: Dataset[_]): DataFrame
- Definition Classes
- HasReaderContent
- def buildErrorDataFrame(dataset: Dataset[_], contentPath: String, ext: String): DataFrame
- Definition Classes
- HasReaderContent
- val cellSeparator: Param[String]
- Definition Classes
- HasExcelReaderProperties
- final def clear(param: Param[_]): Reader2Doc.this.type
- Definition Classes
- Params
- def clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.CloneNotSupportedException]) @HotSpotIntrinsicCandidate() @native()
- val contentPath: Param[String]
- Definition Classes
- HasReaderProperties
- val contentType: Param[String]
- Definition Classes
- HasReaderProperties
- def copy(extra: ParamMap): Transformer
- Definition Classes
- Reader2Doc → Transformer → PipelineStage → Params
- def copyValues[T <: Params](to: T, extra: ParamMap): T
- Attributes
- protected
- Definition Classes
- Params
- final def defaultCopy[T <: Params](extra: ParamMap): T
- Attributes
- protected
- Definition Classes
- Params
- final def eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- def equals(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef → Any
- val excludeNonText: BooleanParam
- def explainParam(param: Param[_]): String
- Definition Classes
- Params
- def explainParams(): String
- Definition Classes
- Params
- val explodeDocs: BooleanParam
- Definition Classes
- HasReaderProperties
- final def extractParamMap(): ParamMap
- Definition Classes
- Params
- final def extractParamMap(extra: ParamMap): ParamMap
- Definition Classes
- Params
- val features: ArrayBuffer[Feature[_, _, _]]
- Definition Classes
- HasFeatures
- val flattenOutput: BooleanParam
- Definition Classes
- HasReaderProperties
- def get[T](feature: StructFeature[T]): Option[T]
- Attributes
- protected
- Definition Classes
- HasFeatures
- def get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]]
- Attributes
- protected
- Definition Classes
- HasFeatures
- def get[T](feature: SetFeature[T]): Option[Set[T]]
- Attributes
- protected
- Definition Classes
- HasFeatures
- def get[T](feature: ArrayFeature[T]): Option[Array[T]]
- Attributes
- protected
- Definition Classes
- HasFeatures
- final def get[T](param: Param[T]): Option[T]
- Definition Classes
- Params
- final def getClass(): Class[_ <: AnyRef]
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- def getContentType: String
- Definition Classes
- HasReaderContent
- final def getDefault[T](param: Param[T]): Option[T]
- Definition Classes
- Params
- val getFileName: UserDefinedFunction
- Definition Classes
- HasReaderContent
- final def getInputCol: String
- Definition Classes
- HasReaderProperties
- final def getOrDefault[T](param: Param[T]): T
- Definition Classes
- Params
- final def getOutputCol: String
Gets the name of the annotation column that will be generated.
- Definition Classes
- HasOutputAnnotationCol
- def getParam(paramName: String): Param[Any]
- Definition Classes
- Params
- val groupBrokenParagraphs: Param[Boolean]
- Definition Classes
- HasTextReaderProperties
- final def hasDefault[T](param: Param[T]): Boolean
- Definition Classes
- Params
- def hasParam(paramName: String): Boolean
- Definition Classes
- Params
- def hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- val headers: Param[Map[String, String]]
- Definition Classes
- HasHTMLReaderProperties
- val ignoreExceptions: BooleanParam
- Definition Classes
- HasReaderProperties
- val includePageBreaks: Param[Boolean]
- Definition Classes
- HasReaderProperties
- val includeSlideNotes: Param[Boolean]
- Definition Classes
- HasPowerPointProperties
- val includeTitleTag: Param[Boolean]
- Definition Classes
- HasHTMLReaderProperties
- val inferTableStructure: Param[Boolean]
- Definition Classes
- HasReaderProperties
- def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
- Attributes
- protected
- Definition Classes
- Logging
- def initializeLogIfNecessary(isInterpreter: Boolean): Unit
- Attributes
- protected
- Definition Classes
- Logging
- final val inputCol: Param[String]
- Attributes
- protected
- Definition Classes
- HasReaderProperties
- final def isDefined(param: Param[_]): Boolean
- Definition Classes
- Params
- final def isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- final def isSet(param: Param[_]): Boolean
- Definition Classes
- Params
- def isTraceEnabled(): Boolean
- Attributes
- protected
- Definition Classes
- Logging
- def listAllFilesRecursively(dir: File): Seq[File]
- Definition Classes
- HasReaderContent
- def log: Logger
- Attributes
- protected
- Definition Classes
- Logging
- def logDebug(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logDebug(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logError(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logError(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logInfo(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logInfo(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logName: String
- Attributes
- protected
- Definition Classes
- Logging
- def logTrace(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logTrace(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logWarning(msg: => String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
- def logWarning(msg: => String): Unit
- Attributes
- protected
- Definition Classes
- Logging
- val maxLineCount: Param[Int]
- Definition Classes
- HasTextReaderProperties
- final def ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- final def notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- def onWrite(path: String, spark: SparkSession): Unit
- Attributes
- protected
- Definition Classes
- ParamsAndFeaturesWritable
- val onlyLeafNodes: Param[Boolean]
- Definition Classes
- HasXmlReaderProperties
- val outputAnnotatorType: AnnotatorType
- Definition Classes
- Reader2Doc → HasOutputAnnotatorType
- val outputAsDocument: BooleanParam
Whether to return all sentences joined into a single document
- final val outputCol: Param[String]
- Attributes
- protected
- Definition Classes
- HasOutputAnnotationCol
- val outputFormat: Param[String]
- Definition Classes
- HasHTMLReaderProperties
- val paragraphSplit: Param[String]
- Definition Classes
- HasTextReaderProperties
- lazy val params: Array[Param[_]]
- Definition Classes
- Params
- def partitionBuilder: Partition
- Attributes
- protected
- def partitionContent(partition: Partition, contentPath: String, isText: Boolean, dataset: Dataset[_]): DataFrame
- Definition Classes
- HasReaderContent
- def partitionContentFromPath(partition: Partition, contentPath: String, isText: Boolean, dataset: Dataset[_]): DataFrame
- Definition Classes
- HasReaderContent
- def partitionMixedContent(dataset: Dataset[_], dirPath: String, partitionParams: Map[String, String]): DataFrame
- Definition Classes
- HasReaderContent
- def partitionToAnnotation: UserDefinedFunction
- final val readAsImage: BooleanParam
- Definition Classes
- HasPdfReaderProperties
- def retrieveFileName(path: String): String
- Definition Classes
- HasReaderContent
- def save(path: String): Unit
- Definition Classes
- MLWritable
- Annotations
- @throws("If the input path already exists but overwrite is not enabled.") @Since("1.6.0")
- def set[T](feature: StructFeature[T], value: T): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- def set[K, V](feature: MapFeature[K, V], value: Map[K, V]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- def set[T](feature: SetFeature[T], value: Set[T]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- def set[T](feature: ArrayFeature[T], value: Array[T]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- final def set(paramPair: ParamPair[_]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- Params
- final def set(param: String, value: Any): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- Params
- final def set[T](param: Param[T], value: T): Reader2Doc.this.type
- Definition Classes
- Params
- def setAddAttachmentContent(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasEmailReaderProperties
- def setAppendCells(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasExcelReaderProperties
- def setCellSeparator(value: String): Reader2Doc.this.type
- Definition Classes
- HasExcelReaderProperties
- def setContentPath(value: String): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setContentType(value: String): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setDefault[T](feature: StructFeature[T], value: () => T): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- def setDefault[K, V](feature: MapFeature[K, V], value: () => Map[K, V]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- def setDefault[T](feature: SetFeature[T], value: () => Set[T]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- def setDefault[T](feature: ArrayFeature[T], value: () => Array[T]): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- HasFeatures
- final def setDefault(paramPairs: ParamPair[_]*): Reader2Doc.this.type
- Attributes
- protected
- Definition Classes
- Params
- final def setDefault[T](param: Param[T], value: T): Reader2Doc.this.type
- Attributes
- protected[org.apache.spark.ml]
- Definition Classes
- Params
- def setExcludeNonText(value: Boolean): Reader2Doc.this.type
Excludes rows that are not text data (e.g. tables).
- def setExplodeDocs(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setFlattenOutput(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setGroupBrokenParagraphs(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasTextReaderProperties
- def setHeaders(value: Map[String, String]): Reader2Doc.this.type
- Definition Classes
- HasHTMLReaderProperties
- def setHeadersPython(headers: Map[String, String]): Reader2Doc.this.type
- Definition Classes
- HasHTMLReaderProperties
- def setIgnoreExceptions(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setIncludePageBreaks(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setIncludeSlideNotes(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasPowerPointProperties
- def setIncludeTitleTag(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasHTMLReaderProperties
- def setInferTableStructure(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- final def setInputCol(value: String): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setMaxLineCount(value: Int): Reader2Doc.this.type
- Definition Classes
- HasTextReaderProperties
- def setOnlyLeafNodes(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasXmlReaderProperties
- def setOutputAsDocument(value: Boolean): Reader2Doc.this.type
Whether to return all sentences joined into a single document
- final def setOutputCol(value: String): Reader2Doc.this.type
Overrides the annotation column name used when transforming.
- Definition Classes
- HasOutputAnnotationCol
- def setOutputFormat(value: String): Reader2Doc.this.type
- Definition Classes
- HasHTMLReaderProperties
- def setParagraphSplit(value: String): Reader2Doc.this.type
- Definition Classes
- HasTextReaderProperties
- def setShortLineWordThreshold(value: Int): Reader2Doc.this.type
- Definition Classes
- HasTextReaderProperties
- def setStoreContent(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setThreshold(value: Double): Reader2Doc.this.type
- Definition Classes
- HasTextReaderProperties
- def setTimeout(value: Int): Reader2Doc.this.type
- Definition Classes
- HasHTMLReaderProperties
- def setTitleFontSize(value: Int): Reader2Doc.this.type
- Definition Classes
- HasReaderProperties
- def setTitleLengthSize(value: Int): Reader2Doc.this.type
- Definition Classes
- HasTextReaderProperties
- def setTitleThreshold(value: Double): Reader2Doc.this.type
- Definition Classes
- HasPdfReaderProperties
- def setXmlKeepTags(value: Boolean): Reader2Doc.this.type
- Definition Classes
- HasXmlReaderProperties
- val shortLineWordThreshold: Param[Int]
- Definition Classes
- HasTextReaderProperties
- val storeContent: Param[Boolean]
- Definition Classes
- HasReaderProperties
- val supportedTypes: Map[String, (String, Boolean)]
- Definition Classes
- HasReaderContent
- final def synchronized[T0](arg0: => T0): T0
- Definition Classes
- AnyRef
- val threshold: Param[Double]
- Definition Classes
- HasTextReaderProperties
- val timeout: Param[Int]
- Definition Classes
- HasHTMLReaderProperties
- val titleFontSize: Param[Int]
- Definition Classes
- HasReaderProperties
- val titleLengthSize: Param[Int]
- Definition Classes
- HasTextReaderProperties
- val titleThreshold: Param[Double]
- Definition Classes
- HasPdfReaderProperties
- def toString(): String
- Definition Classes
- Identifiable → AnyRef → Any
- def transform(dataset: Dataset[_]): DataFrame
- Definition Classes
- Reader2Doc → Transformer
- def transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame
- Definition Classes
- Transformer
- Annotations
- @Since("2.0.0")
- def transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame
- Definition Classes
- Transformer
- Annotations
- @varargs() @Since("2.0.0")
- def transformSchema(schema: StructType): StructType
- Definition Classes
- Reader2Doc → PipelineStage
- def transformSchema(schema: StructType, logging: Boolean): StructType
- Attributes
- protected
- Definition Classes
- PipelineStage
- Annotations
- @DeveloperApi()
- val uid: String
- Definition Classes
- Reader2Doc → Identifiable
- def validateRequiredParameters(): Unit
- Attributes
- protected
- final def wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- final def wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException]) @native()
- final def wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- def write: MLWriter
- Definition Classes
- ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
- val xmlKeepTags: Param[Boolean]
- Definition Classes
- HasXmlReaderProperties
Deprecated Value Members
- def finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.Throwable]) @Deprecated
- Deprecated
(Since version 9)