c

com.johnsnowlabs.reader

PowerPointReader

class PowerPointReader extends Serializable

Class to read and parse PowerPoint files.

Linear Supertypes
Serializable, AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. PowerPointReader
  2. Serializable
  3. AnyRef
  4. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. Protected

Instance Constructors

  1. new PowerPointReader(storeContent: Boolean = false, inferTableStructure: Boolean = false, includeSlideNotes: Boolean = false, outputFormat: String = "json-table")

    storeContent

    Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output. The default value is false.

    inferTableStructure

    Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout. The default value is false.

    includeSlideNotes

    Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements. The default value is false. docPath: a path to a directory of PowerPoint files or a path to a single PowerPoint file, e.g. "path/power-point/files"

    Example

    val path = "./ppt-files/fake-power-point.pptx"
    val powerPointReader = new PowerPointReader()
    val pptDf = powerPointReader.ppt(path)
    pptDf.show()
    +--------------------+--------------------+
    |                path|                 ppt|
    +--------------------+--------------------+
    |file:/content/ppt...|[{Title, Adding a...|
    +--------------------+--------------------+
    
    pptDf.printSchema()
    root
     |-- path: string (nullable = true)
     |-- ppt: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- elementType: string (nullable = true)
     |    |    |-- content: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)

    For more examples please refer to this notebook.

    Value Members

    1. final def !=(arg0: Any): Boolean
      Definition Classes
      AnyRef → Any
    2. final def ##: Int
      Definition Classes
      AnyRef → Any
    3. final def ==(arg0: Any): Boolean
      Definition Classes
      AnyRef → Any
    4. final def asInstanceOf[T0]: T0
      Definition Classes
      Any
    5. def clone(): AnyRef
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      @throws(classOf[java.lang.CloneNotSupportedException]) @HotSpotIntrinsicCandidate() @native()
    6. final def eq(arg0: AnyRef): Boolean
      Definition Classes
      AnyRef
    7. def equals(arg0: AnyRef): Boolean
      Definition Classes
      AnyRef → Any
    8. final def getClass(): Class[_ <: AnyRef]
      Definition Classes
      AnyRef → Any
      Annotations
      @HotSpotIntrinsicCandidate() @native()
    9. def getOutputColumn: String
    10. def hashCode(): Int
      Definition Classes
      AnyRef → Any
      Annotations
      @HotSpotIntrinsicCandidate() @native()
    11. final def isInstanceOf[T0]: Boolean
      Definition Classes
      Any
    12. final def ne(arg0: AnyRef): Boolean
      Definition Classes
      AnyRef
    13. final def notify(): Unit
      Definition Classes
      AnyRef
      Annotations
      @HotSpotIntrinsicCandidate() @native()
    14. final def notifyAll(): Unit
      Definition Classes
      AnyRef
      Annotations
      @HotSpotIntrinsicCandidate() @native()
    15. def ppt(filePath: String): DataFrame

      filePath

      this is a path to a directory of ppt files or a path to a ppt file, e.g. "path/ppts/files"

      returns

      DataFrame with parsed PowerPoint content.

    16. def pptToHTMLElement(content: Array[Byte]): Seq[HTMLElement]
    17. def setOutputColumn(value: String): PowerPointReader.this.type
    18. final def synchronized[T0](arg0: => T0): T0
      Definition Classes
      AnyRef
    19. val titleFontSizeThreshold: Int
    20. def toString(): String
      Definition Classes
      AnyRef → Any
    21. final def wait(arg0: Long, arg1: Int): Unit
      Definition Classes
      AnyRef
      Annotations
      @throws(classOf[java.lang.InterruptedException])
    22. final def wait(arg0: Long): Unit
      Definition Classes
      AnyRef
      Annotations
      @throws(classOf[java.lang.InterruptedException]) @native()
    23. final def wait(): Unit
      Definition Classes
      AnyRef
      Annotations
      @throws(classOf[java.lang.InterruptedException])

    Deprecated Value Members

    1. def finalize(): Unit
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      @throws(classOf[java.lang.Throwable]) @Deprecated
      Deprecated

      (Since version 9)

    Inherited from Serializable

    Inherited from AnyRef

    Inherited from Any

    Ungrouped