class WordReader extends Serializable
Class to read and parse Word files.
- Alphabetic
- By Inheritance
- WordReader
- Serializable
- Serializable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
WordReader(storeContent: Boolean = false, includePageBreaks: Boolean = false, inferTableStructure: Boolean = false)
- storeContent
Whether to include the raw file content in the output DataFrame as a separate content column, alongside the structured output. Default is false.
- includePageBreaks
Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries. Default is false.
- inferTableStructure
Whether to generate an HTML table representation from structured table content. When enabled, a full table element is added alongside cell-level elements, based on row and column layout. Default is false.
Example
val docDirectory = "./word-files/fake_table.docx"
val wordReader = new WordReader()
val wordDf = wordReader.doc(docDirectory)

wordDf.show()
+--------------------+--------------------+
|                path|                 doc|
+--------------------+--------------------+
|file:/content/wor...|[{Table, Header C...|
+--------------------+--------------------+

wordDf.printSchema()
root
 |-- path: string (nullable = true)
 |-- doc: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- elementType: string (nullable = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
For more examples please refer to this notebook.
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
doc(filePath: String): DataFrame
- filePath
Path to a directory of Word files, or to a single Word file, e.g. "path/word/files".
- returns
DataFrame with the parsed Word document content.
- def docToHTMLElement(content: Array[Byte]): Seq[HTMLElement]
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def getOutputColumn: String
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- def setOutputColumn(value: String): WordReader.this.type
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()