: Input Dataset Domain
: Input Dataset Schema
: List of globally defined types
: Input dataset path
: Storage Handler
Saves a dataset. If the path is empty (the first time we compute metrics on the schema), we can write directly.
If Parquet files are already stored there, create a temporary directory to compute in, then flush the path to move the updated metrics into it.
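The save strategy above could be sketched as follows. This is a hypothetical illustration, not the project's actual implementation: the `StorageHandler` methods (`exists`, `delete`, `move`) and the `.tmp` suffix are assumptions.

```scala
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{DataFrame, SaveMode}

// Sketch only: StorageHandler's exists/delete/move are assumed method names.
def saveMetrics(dataset: DataFrame, savePath: Path, storage: StorageHandler): Unit = {
  if (!storage.exists(savePath)) {
    // First run: no Parquet files yet, write directly to the target path.
    dataset.write.mode(SaveMode.Overwrite).parquet(savePath.toString)
  } else {
    // Parquet files already present: compute into a temporary directory,
    // then flush by moving the updated metrics into the target path.
    val tmpPath = new Path(savePath.toString + ".tmp")
    dataset.write.mode(SaveMode.Overwrite).parquet(tmpPath.toString)
    storage.delete(savePath)
    storage.move(tmpPath, savePath)
  }
}
```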
: dataset to be saved
: Path to save the file at
: Input Dataset Domain
Where the magic happens
Input dataset as an RDD of String
Load the JSON as an RDD of String
Spark DataFrame loaded using metadata options
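The two loading steps above (raw RDD of String, then a DataFrame built with reader options) might look like this minimal sketch; the input path and the `multiLine` option are placeholders, since the actual options would come from the schema metadata.

```scala
import org.apache.spark.sql.SparkSession

val session = SparkSession.builder().master("local[*]").getOrCreate()

// Load the JSON as an RDD of String: one line per record.
val rdd = session.sparkContext.textFile("/path/to/input.json")

// Load the same file as a DataFrame; in practice the reader options
// would be derived from the dataset's metadata, not hard-coded.
val df = session.read
  .option("multiLine", value = false)
  .json("/path/to/input.json")
```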
Merged metadata
Partition a dataset using dataset columns. To partition the dataset using the ingestion time, use the reserved column names :
: Input dataset
: List of columns to use for partitioning.
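Partitioning by dataset columns can be sketched with Spark's `partitionBy`. This is an illustration only: the reserved ingestion-time column names are not reproduced here (the original list is truncated above), so plain column names are assumed.

```scala
import org.apache.spark.sql.{DataFrame, SaveMode}

// Sketch: write the dataset partitioned by the given columns.
// When partitioning by ingestion time, the reserved column names
// would be passed in place of ordinary dataset columns.
def partitionedSave(dataset: DataFrame,
                    partitionColumns: List[String],
                    path: String): Unit =
  dataset.write
    .mode(SaveMode.Append)
    .partitionBy(partitionColumns: _*)
    .parquet(path)
```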
The Spark session used to run this job
: Input dataset path
Main entry point as required by the Spark Job interface
: Spark Session used for the job
Merge new and existing datasets if required, then save using Overwrite / Append mode.
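The merge-then-save decision could be sketched as below. The merge semantics shown (a simple union of new and existing rows) are an assumption for illustration, not the project's actual merge logic.

```scala
import org.apache.spark.sql.{DataFrame, SaveMode}

// Sketch: when existing data must be merged, rewrite everything (Overwrite);
// otherwise just append the new rows.
def mergeAndSave(newDs: DataFrame,
                 existing: Option[DataFrame],
                 path: String): Unit =
  existing match {
    case Some(old) =>
      newDs.unionByName(old).write.mode(SaveMode.Overwrite).parquet(path)
    case None =>
      newDs.write.mode(SaveMode.Append).parquet(path)
  }
```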
: Input Dataset Schema
: Storage Handler
: List of globally defined types
Main class to ingest an XML file. If your JSON contains only simple top-level attributes (i.e. DSV-like, but in JSON format), please use SIMPLE_JSON instead. It's way faster.