class ExcelReader extends Serializable
This class is used to read and parse Excel files.
- Alphabetic
- By Inheritance
- ExcelReader
- Serializable
- Serializable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
ExcelReader(titleFontSize: Int = 9, cellSeparator: String = "\t", storeContent: Boolean = false, includePageBreaks: Boolean = false, inferTableStructure: Boolean = false, appendCells: Boolean = false)
- titleFontSize
Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized). By default, it is set to 9.
- cellSeparator
String used to join cell values in a row when assembling textual output. By default, it is set to a tab separator.
- storeContent
Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output. By default, it is set to false.
- includePageBreaks
Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries. By default, it is set to false.
- inferTableStructure
Whether to generate an HTML table representation from structured table content. When enabled, a full `<table>` element is added alongside cell-level elements, based on row and column layout. By default, it is set to false.
- appendCells
Whether to append all rows into a single content block instead of creating separate elements per row. By default, it is set to false.
Example
val path = "./excel-files/PageBreakExample.xlsx"
val excelReader = new ExcelReader()
val excelDf = excelReader.xls(path)
excelDf.show()
+------------------+---------------------+
|path              |xls                  |
+------------------+---------------------+
|file:/content/exce|[{Title, Date\tFri Ju|
+------------------+---------------------+

excelDf.printSchema()
root
 |-- path: string (nullable = true)
 |-- xls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- elementType: string (nullable = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
For more examples please refer to this notebook.
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def getOutputColumn: String
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- def setOutputColumn(value: String): ExcelReader.this.type
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
xls(filePath: String): DataFrame
- filePath
Path to a directory of Excel files or to a single Excel file, e.g. "path/excel/files".
- returns
DataFrame with the parsed Excel content.
- def xlsToHTMLElement(content: Array[Byte]): Seq[HTMLElement]
Ungrouped