Appends trailer data to every file in the data directory. A single trailer file in the pathOutputTrailer directory should correspond to a single data file in the pathOutputData directory. If a trailer for a given file does not exist, the file is moved as-is to the output directory.
Input data files directory
Input trailer files directory
Output concatenated files directory
Hadoop configuration (preferably sparkSession.sparkContext.hadoopConfiguration)
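A minimal sketch of how such a trailer append could work with the Hadoop FileSystem API; the method name appendTrailer and the parameter names pathInputData, pathInputTrailer and pathOutputConcatenated are assumptions for illustration, not the confirmed signature.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.hadoop.io.IOUtils

// Hypothetical sketch: for each file under pathInputData, append the
// same-named file from pathInputTrailer (if present) and write the result
// under pathOutputConcatenated; files without a trailer are copied as-is.
def appendTrailer(pathInputData: String,
                  pathInputTrailer: String,
                  pathOutputConcatenated: String,
                  configuration: Configuration): Unit = {
  val fs         = FileSystem.get(configuration)
  val trailerDir = new Path(pathInputTrailer)
  val outputDir  = new Path(pathOutputConcatenated)
  if (!fs.exists(outputDir)) fs.mkdirs(outputDir)

  fs.listStatus(new Path(pathInputData)).filter(_.isFile).foreach { status =>
    val dataFile    = status.getPath
    val trailerFile = new Path(trailerDir, dataFile.getName)
    val outputFile  = new Path(outputDir, dataFile.getName)

    if (fs.exists(trailerFile)) {
      // Concatenate data file + trailer file into the output file.
      val out = fs.create(outputFile)
      try {
        Seq(dataFile, trailerFile).foreach { src =>
          val in = fs.open(src)
          try IOUtils.copyBytes(in, out, configuration, false)
          finally in.close()
        }
      } finally out.close()
    } else {
      // No matching trailer: copy the data file to the output unchanged.
      FileUtil.copy(fs, dataFile, fs, outputFile, false, configuration)
    }
  }
}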
Method to get data from multiple source paths and combine it into a single destination path.
multiple source paths from which to merge the data.
destination path to combine all the data into.
flag to compress the final output file into gzip format.
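One possible implementation, sketched under the assumption that merging means streaming every file under each source path into one destination file; copyMerge and compressToGzip are illustrative names.

import java.util.zip.GZIPOutputStream
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils

// Stream all files from the source paths into a single destination file,
// optionally wrapping the destination stream in gzip compression.
def copyMerge(sourcePaths: Seq[String],
              destinationPath: String,
              compressToGzip: Boolean,
              configuration: Configuration): Unit = {
  val fs     = FileSystem.get(configuration)
  val rawOut = fs.create(new Path(destinationPath))
  val out    = if (compressToGzip) new GZIPOutputStream(rawOut) else rawOut
  try {
    for {
      source <- sourcePaths
      status <- fs.listStatus(new Path(source)) if status.isFile
    } {
      val in = fs.open(status.getPath)
      try IOUtils.copyBytes(in, out, configuration, false)
      finally in.close()
    }
  } finally out.close()
}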
Method to get an empty dataframe with the below Ab Initio log schema:
record string("|") node, timestamp, component, subcomponent, event_type; string("|\n") event_text; end
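Assuming each field of the DML record maps to a nullable Spark StringType column, the empty dataframe could be built as below; getEmptyLogDataFrame is a hypothetical name.

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// One nullable string column per field of the "|"-delimited DML record above.
def getEmptyLogDataFrame(spark: SparkSession): DataFrame = {
  val schema = StructType(
    Seq("node", "timestamp", "component", "subcomponent", "event_type", "event_text")
      .map(StructField(_, StringType, nullable = true))
  )
  spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
}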
Method to read data from a Hive table.
spark session.
hive database.
hive table.
hive table partition to specifically read data from, if provided.
dataframe with the data read from the Hive table.
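A minimal sketch, assuming the optional partition is passed as a single "key=value" string that is turned into a filter; readHiveTable and its parameter names are illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}

// Read the whole table, narrowing to one partition when a spec is given.
def readHiveTable(spark: SparkSession,
                  database: String,
                  table: String,
                  partition: String = ""): DataFrame = {
  val df = spark.read.table(s"$database.$table")
  if (partition.nonEmpty) {
    val Array(key, value) = partition.split("=", 2)
    df.filter(df(key) === value)   // e.g. partition = "dt=2021-01-01"
  } else df
}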
Reads a full Hive table partition by reading every subpartition separately and performing a union on all the resulting DataFrames.
This function is a temporary workaround for the Hive metastore crashing when too many partitions are queried at the same time.
spark session
hive database name
hive table name
top-level partition's key
top-level partition's value
A complete DataFrame with the selected hive table partition
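A sketch of the workaround, assuming the subpartitions can be listed with SHOW PARTITIONS and each one queried separately before unioning; the function and parameter names are illustrative, and the parsing of the partition specs is an assumption about the metastore output.

import org.apache.spark.sql.{DataFrame, SparkSession}

def readHiveTablePartition(spark: SparkSession,
                           database: String,
                           table: String,
                           partitionKey: String,
                           partitionValue: String): DataFrame = {
  // Specs come back as strings such as "dt=2021-01-01/hour=00".
  val subPartitions = spark
    .sql(s"SHOW PARTITIONS $database.$table PARTITION ($partitionKey='$partitionValue')")
    .collect()
    .map(_.getString(0))

  // One small query per subpartition keeps each metastore call cheap;
  // reduce assumes at least one subpartition exists.
  subPartitions
    .map { spec =>
      val predicate = spec
        .split("/")
        .map { kv => val Array(k, v) = kv.split("=", 2); s"$k='$v'" }
        .mkString(" AND ")
      spark.sql(s"SELECT * FROM $database.$table WHERE $predicate")
    }
    .reduce(_ union _)
}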
Method to take the union of all passed dataframes.
list of dataframes of which to take the union.
union of all passed input dataframes.
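Assuming all inputs share the same schema, the union reduces to a single fold; unionAll is an illustrative name (unionByName could be substituted if column order may differ).

import org.apache.spark.sql.DataFrame

// Fold the list into one DataFrame; requires a non-empty list with
// identically ordered schemas.
def unionAll(dataFrames: List[DataFrame]): DataFrame =
  dataFrames.reduce(_ union _)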
Method to write the data in the passed dataframe in a specific file format.
dataframe containing data.
path to write data to.
spark session.
underlying data source specific properties.
file format in which to persist data. Supported file formats are csv, text, json, parquet, and orc.
columns to be used for partitioning.
used to bucket the output by the given columns. If specified, the output is laid out on the file-system similar to Hive's bucketing scheme.
number of buckets to be used.
columns on which to order data while persisting.
table name for persisting data.
database name for persisting data.
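A hedged sketch of how these options could be wired into Spark's DataFrameWriter; the method name, parameter names and the Overwrite save mode are assumptions, and bucketing/sorting are only applied when persisting as a table because Spark requires bucketBy (and sortBy) to go through saveAsTable.

import org.apache.spark.sql.{DataFrame, SaveMode}

def writeDataFrame(df: DataFrame,
                   path: String,
                   props: Map[String, String],
                   format: String,                       // csv, text, json, parquet, orc
                   partitionColumns: Seq[String] = Nil,
                   bucketColumns: Seq[String] = Nil,
                   numBuckets: Option[Int] = None,
                   sortColumns: Seq[String] = Nil,
                   tableName: Option[String] = None,
                   databaseName: Option[String] = None): Unit = {
  var writer = df.write.format(format).options(props).mode(SaveMode.Overwrite)

  if (partitionColumns.nonEmpty)
    writer = writer.partitionBy(partitionColumns: _*)

  (numBuckets, bucketColumns) match {
    case (Some(n), bCol +: bRest) =>
      writer = writer.bucketBy(n, bCol, bRest: _*)
      // Spark only supports sortBy together with bucketBy.
      sortColumns match {
        case sCol +: sRest => writer = writer.sortBy(sCol, sRest: _*)
        case _             =>
      }
    case _ =>
  }

  tableName match {
    case Some(table) =>
      val qualified = databaseName.map(db => s"$db.$table").getOrElse(table)
      writer.option("path", path).saveAsTable(qualified)
    case None =>
      writer.save(path)
  }
}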
UDF to write logging parameters to the log port.
Deprecated; see the corresponding Javadoc for more information.
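For illustration only, a hypothetical UDF that formats the logging parameters into the "|"-delimited record layout of the Ab Initio log schema above.

import org.apache.spark.sql.functions.udf

// Join the six log fields with "|" so the row matches the log record format.
val writeToLog = udf {
  (node: String, timestamp: String, component: String,
   subcomponent: String, eventType: String, eventText: String) =>
    s"$node|$timestamp|$component|$subcomponent|$eventType|$eventText"
}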
Helper Utilities for reading/writing data from/to different data sources.