KafkaIngestionJob

Instance Constructors

new KafkaIngestionJob(domain: Domain, schema: Schema, types: List[Type], path: List[Path], storageHandler: StorageHandler, schemaHandler: SchemaHandler, options: Map[String, String])(implicit settings: Settings)

domain
: Output Dataset Domain
schema
: Topic Name
types
: List of globally defined types
path
: Unused
storageHandler
: Storage Handler

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def analyze(fullTableName: String): Any

Attributes
protected
Definition Classes
SparkJob
def appendToFile(storageHandler: StorageHandler, dataToSave: DataFrame, path: Path): Unit

Saves a dataset.
Saves a dataset. If the path is empty (the first time we call metrics on the schema) then we can write.
If there's already parquet files stored in it, then create a temporary directory to compute on, and flush the path to move updated metrics in it
dataToSave
: dataset to be saved
path
: Path to save the file at

Attributes
protected
Definition Classes
SparkJob
def applyIgnore(dfIn: DataFrame): Dataset[Row]

Attributes
protected
Definition Classes
IngestionJob
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
def createViews(views: Views, sqlParameters: Map[String, String], activeEnv: Map[String, String]): Unit

Attributes
protected
Definition Classes
SparkJob
val domain: Domain

: Input Dataset Domain
: Input Dataset Domain

Definition Classes
JsonIngestionJob → IngestionJob
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
def getWriteMode(): WriteMode

Definition Classes
IngestionJob
def hashCode(): Int

Definition Classes
AnyRef → Any
def ingest(dataset: DataFrame): (RDD[_], RDD[_])

Where the magic happen
Where the magic happen
dataset
input dataset as a RDD of string

Attributes
protected
Definition Classes
JsonIngestionJob → IngestionJob
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
def loadDataSet(): Try[DataFrame]

load the json as an RDD of String
load the json as an RDD of String
returns
Spark Dataframe loaded using metadata options

Attributes
protected
Definition Classes
JsonIngestionJob → IngestionJob
def loadJsonData(): Dataset[String]

Load dataset using spark csv reader and all metadata.
Load dataset using spark csv reader and all metadata. Does not infer schema. columns not defined in the schema are dropped fro the dataset (require datsets with a header)
returns
Spark DataFrame where each row holds a single string

Attributes
protected
Definition Classes
KafkaIngestionJob → JsonIngestionJob
val logger: Logger

Attributes
protected
Definition Classes
StrictLogging
lazy val metadata: Metadata

Merged metadata
Merged metadata

Definition Classes
IngestionJob
def name: String

Definition Classes
JsonIngestionJob → JobBase
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
val now: Timestamp

Definition Classes
IngestionJob
var offsets: List[(Int, Long)]
val options: Map[String, String]

Definition Classes
JsonIngestionJob → IngestionJob
def partitionDataset(dataset: DataFrame, partition: List[String]): DataFrame

Attributes
protected
Definition Classes
SparkJob
def partitionedDatasetWriter(dataset: DataFrame, partition: List[String]): DataFrameWriter[Row]

Partition a dataset using dataset columns.
Partition a dataset using dataset columns. To partition the dataset using the ingestion time, use the reserved column names :
- comet_date
- comet_year
- comet_month
- comet_day
- comet_hour
- comet_minute These columns are renamed to "date", "year", "month", "day", "hour", "minute" in the dataset and their values is set to the current date/time.
dataset
: Input dataset
partition
: list of columns to use for partitioning.
returns
The Spark session used to run this job

Attributes
protected
Definition Classes
SparkJob
val path: List[Path]

: Input dataset path
: Input dataset path

Definition Classes
JsonIngestionJob → IngestionJob
def registerUdf(udf: String): Unit

Attributes
protected
Definition Classes
SparkJob
def run(): Try[JobResult]

Main entry point as required by the Spark Job interface
Main entry point as required by the Spark Job interface
returns
: Spark Session used for the job

Definition Classes
KafkaIngestionJob → IngestionJob → JobBase
def saveAccepted(acceptedDF: DataFrame): (DataFrame, Path)

Merge new and existing dataset if required Save using overwrite / Append mode
Merge new and existing dataset if required Save using overwrite / Append mode

Attributes
protected
Definition Classes
IngestionJob
def saveRejected(rejectedRDD: RDD[String]): Try[Path]

Attributes
protected
Definition Classes
IngestionJob
val schema: Schema

: Input Dataset Schema
: Input Dataset Schema

Definition Classes
JsonIngestionJob → IngestionJob
val schemaHandler: SchemaHandler

Definition Classes
JsonIngestionJob → IngestionJob
lazy val schemaSparkType: StructType

Definition Classes
JsonIngestionJob
lazy val session: SparkSession

Definition Classes
SparkJob
implicit val settings: Settings

Definition Classes
JsonIngestionJob → JobBase
lazy val sparkEnv: SparkEnv

Definition Classes
SparkJob
val storageHandler: StorageHandler

: Storage Handler
: Storage Handler

Definition Classes
JsonIngestionJob → IngestionJob
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
val types: List[Type]

: List of globally defined types
: List of globally defined types

Definition Classes
JsonIngestionJob → IngestionJob
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )

Related Doc: package ingest

class KafkaIngestionJob extends JsonIngestionJob

Instance Constructors

new KafkaIngestionJob(domain: Domain, schema: Schema, types: List[Type], path: List[Path], storageHandler: StorageHandler, schemaHandler: SchemaHandler, options: Map[String, String])(implicit settings: Settings)

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

def analyze(fullTableName: String): Any

def appendToFile(storageHandler: StorageHandler, dataToSave: DataFrame, path: Path): Unit

def applyIgnore(dfIn: DataFrame): Dataset[Row]

final def asInstanceOf[T0]: T0

def clone(): AnyRef

def createViews(views: Views, sqlParameters: Map[String, String], activeEnv: Map[String, String]): Unit

val domain: Domain

final def eq(arg0: AnyRef): Boolean

def equals(arg0: Any): Boolean

def finalize(): Unit

final def getClass(): Class[_]

def getWriteMode(): WriteMode

def hashCode(): Int

def ingest(dataset: DataFrame): (RDD[_], RDD[_])

final def isInstanceOf[T0]: Boolean

def loadDataSet(): Try[DataFrame]

def loadJsonData(): Dataset[String]

val logger: Logger

lazy val metadata: Metadata

def name: String

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

val now: Timestamp

var offsets: List[(Int, Long)]

val options: Map[String, String]

def partitionDataset(dataset: DataFrame, partition: List[String]): DataFrame

def partitionedDatasetWriter(dataset: DataFrame, partition: List[String]): DataFrameWriter[Row]

val path: List[Path]

def registerUdf(udf: String): Unit

def run(): Try[JobResult]

def saveAccepted(acceptedDF: DataFrame): (DataFrame, Path)

def saveRejected(rejectedRDD: RDD[String]): Try[Path]

val schema: Schema

val schemaHandler: SchemaHandler

lazy val schemaSparkType: StructType

lazy val session: SparkSession

implicit val settings: Settings

lazy val sparkEnv: SparkEnv

val storageHandler: StorageHandler

final def synchronized[T0](arg0: ⇒ T0): T0

def toString(): String

val types: List[Type]

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

Inherited from JsonIngestionJob

Inherited from IngestionJob

Inherited from SparkJob

Inherited from JobBase

Inherited from StrictLogging

Inherited from AnyRef

Inherited from Any

Ungrouped