-
public final class ApiV1Kt
-
Field Summary
Fields
public final static Map<KClass<?>, Encoder<?>>
ENCODERS
private final static SparkContext
sparkContext
private final static TimestampType$
timestampDt
private final static DateType$
dateDt
-
Method Summary
final SparkContext
getSparkContext()
final TimestampType$
getTimestampDt()
final DateType$
getDateDt()
final static <T extends Any> Broadcast<T>
broadcast(SparkSession $self, T value)
Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions.
final static <T extends Any> Broadcast<T>
broadcast(SparkContext $self, T value)
Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions.
final static <T extends Any> Dataset<T>
toDS(SparkSession $self, List<T> list)
Utility method to create a dataset from a list.
final static <T extends Any> Dataset<T>
toDS(List<T> $self, SparkSession spark)
Utility method to create a dataset from a list.
final static <T extends Any> Dataset<T>
dsOf(SparkSession $self, T t)
Utility method to create a dataset from the given elements.
final static <T extends Any> Encoder<T>
encoder()
Main method of the API, which gives you seamless integration with Spark: it creates an encoder for any given supported type T. Supported types are data classes, primitives, and Lists, Maps, and Arrays containing them.
final static <T extends Any> Encoder<T>
generateEncoder(KType type, KClass<?> cls)
final static <T extends Any, R extends Any> Dataset<R>
map(Dataset<T> $self, Function1<T, R> func)
final static <T extends Any, R extends Any> Dataset<R>
flatMap(Dataset<T> $self, Function1<T, Iterator<R>> func)
final static <T extends Any, I extends Iterable<T>> Dataset<T>
flatten(Dataset<I> $self)
final static <T extends Any, R extends Any> KeyValueGroupedDataset<R, T>
groupByKey(Dataset<T> $self, Function1<T, R> func)
final static <T extends Any, R extends Any> Dataset<R>
mapPartitions(Dataset<T> $self, Function1<Iterator<T>, Iterator<R>> func)
final static <T extends Any> Dataset<T>
filterNotNull(Dataset<T> $self)
final static <KEY extends Any, VALUE extends Any, R extends Any> KeyValueGroupedDataset<KEY, R>
mapValues(KeyValueGroupedDataset<KEY, VALUE> $self, Function1<VALUE, R> func)
final static <KEY extends Any, VALUE extends Any, R extends Any> Dataset<R>
mapGroups(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<KEY, Iterator<VALUE>, R> func)
final static <KEY extends Any, VALUE extends Any> Dataset<Pair<KEY, VALUE>>
reduceGroupsK(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<VALUE, VALUE, VALUE> func)
final static <T1 extends Any, T2 extends Any> Dataset<T1>
takeKeysTuple2(Dataset<Tuple2<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T1>
takeKeys(Dataset<Pair<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T1>
takeKeysArity2(Dataset<Arity2<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T2>
takeValuesTuple2(Dataset<Tuple2<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T2>
takeValues(Dataset<Pair<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T2>
takeValuesArity2(Dataset<Arity2<T1, T2>> $self)
final static <K extends Any, V extends Any, U extends Any> Dataset<U>
flatMapGroups(KeyValueGroupedDataset<K, V> $self, Function2<K, Iterator<V>, Iterator<U>> func)
final static <S extends Any> S
getOrNull(GroupState<S> $self)
final static <S extends Any> S
getValue(GroupState<S> $self, Object thisRef, KProperty<?> property)
final static <S extends Any> Unit
setValue(GroupState<S> $self, Object thisRef, KProperty<?> property, S value)
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U>
mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, Function3<K, Iterator<V>, GroupState<S>, U> func)
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U>
mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, U> func)
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U>
flatMapGroupsWithState(KeyValueGroupedDataset<K, V> $self, OutputMode outputMode, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, Iterator<U>> func)
final static <K extends Any, V extends Any, U extends Any, R extends Any> Dataset<R>
cogroup(KeyValueGroupedDataset<K, V> $self, KeyValueGroupedDataset<K, U> other, Function3<K, Iterator<V>, Iterator<U>, Iterator<R>> func)
final static <T extends Any, R extends Any> Dataset<R>
downcast(Dataset<T> $self)
final static <R extends Any> Dataset<R>
as(Dataset<?> $self)
final static <T extends Any> TypedColumn<Object, T>
as(Column $self)
Provides a type hint about the expected return value of this column.
final static <R extends Any> Dataset<R>
to(Dataset<?> $self)
final static <T extends Any> Unit
forEach(Dataset<T> $self, Function1<T, Unit> func)
final static <T extends Any> Unit
forEachPartition(Dataset<T> $self, Function1<Iterator<T>, Unit> func)
final static <T extends Any> Dataset<T>
debugCodegen(Dataset<T> $self)
It is hard to call Dataset.debugCodegen from Kotlin, so here is a utility for that.
final static <T extends Any> Dataset<T>
debug(Dataset<T> $self)
It is hard to call Dataset.debug from Kotlin, so here is a utility for that.
final static Column
==(Column $self, Column c)
final static Column
unaryMinus(Column $self)
Unary minus, i.e. negate the expression.
final static Column
not(Column $self)
Inversion of boolean expression, i.e. NOT.
final static Column
eq(Column $self, Object other)
Equality test.
final static Column
===(Column $self, Object other)
Equality test.
final static Column
neq(Column $self, Object other)
Inequality test.
final static Column
=!=(Column $self, Object other)
Inequality test.
final static Column
gt(Column $self, Object other)
Greater than.
final static Column
lt(Column $self, Object other)
Less than.
final static Column
leq(Column $self, Object other)
Less than or equal to.
final static Column
geq(Column $self, Object other)
Greater than or equal to an expression.
final static Column
inRangeOf(Column $self, ClosedRange<?> range)
True if the current column is in the given range.
final static Column
or(Column $self, Object other)
Boolean OR.
final static Column
and(Column $self, Object other)
Boolean AND.
final static Column
&&(Column $self, Object other)
Boolean AND.
final static Column
times(Column $self, Object other)
Multiplication of this expression and another expression.
final static Column
div(Column $self, Object other)
Division of this expression by another expression.
final static Column
rem(Column $self, Object other)
Modulo (a.k.a. remainder) expression.
final static Column
get(Column $self, Object key)
An expression that gets an item at position ordinal out of an array, or gets a value by key key in a MapType.
final static Column
lit(Object a)
final static <L extends Any, R extends Any> Dataset<Pair<L, R>>
leftJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "left" join type and reflects the fact that, in the result of a left join, the right relation is nullable.
final static <L extends Any, R extends Any> Dataset<Pair<L, R>>
rightJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "right" join type and reflects the fact that, in the result of a right join, the left relation is nullable.
final static <L extends Any, R extends Any> Dataset<Pair<L, R>>
innerJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "inner" join type.
final static <L extends Any, R extends Any> Dataset<Pair<L, R>>
fullJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "full" join type and reflects the fact that, in the result of a full join, any element of the resulting tuple is nullable.
final static <T extends Any> Dataset<T>
sort(Dataset<T> $self, Function1<Dataset<T>, Array<Column>> columns)
Alias for Dataset.sort that forces the user to provide sort columns from the source dataset.
final static <T extends Any> Dataset<T>
sort(Dataset<T> $self, KProperty1<T, ?> col, KProperty1<T, ?> cols)
Allows sorting a data class dataset on one or more properties of the data class.
final static <T extends Any, R extends Any> R
withCached(Dataset<T> $self, Boolean blockingUnpersist, Function1<Dataset<T>, R> executeOnCached)
Creates a block in which one can run further computations on an already cached dataset; the data will be unpersisted automatically at the end of the computation.
final static <T extends Any> List<T>
toList(Dataset<Row> $self)
final static <R extends Any> Array<R>
toArray(Dataset<?> $self)
final static <T extends Any> Column
invoke(Dataset<T> $self, String colName)
Selects a column based on the column name and returns it as a Column.
final static <T extends Any, U extends Any> TypedColumn<T, U>
invoke(Dataset<T> $self, KProperty1<T, U> column)
Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
final static <T extends Any, U extends Any> TypedColumn<T, U>
col(Dataset<T> $self, KProperty1<T, U> column)
Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
final static <T extends Any, U extends Any> TypedColumn<T, U>
col(KProperty1<T, U> column)
Returns a Column based on the given class attribute, not connected to a dataset.
final static <T extends Any> Dataset<T>
showDS(Dataset<T> $self, Integer numRows, Boolean truncate)
Alternative to Dataset.show that returns the source dataset.
final static <T extends Any, U1 extends Any, U2 extends Any> Dataset<Pair<U1, U2>>
selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2)
Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any> Dataset<Triple<U1, U2, U3>>
selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3)
Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any> Dataset<Arity4<U1, U2, U3, U4>>
selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4)
Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any, U5 extends Any> Dataset<Arity5<U1, U2, U3, U4, U5>>
selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4, TypedColumn<T, U5> c5)
Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any> DataType
schema(Map<String, KType> map)
final static DataType
schema(KType type, Map<String, KType> map)
final static Unit
setLogLevel(SparkContext $self, SparkLogLevel level)
-
Method Detail
-
getSparkContext
final SparkContext getSparkContext()
-
getTimestampDt
final TimestampType$ getTimestampDt()
-
getDateDt
final DateType$ getDateDt()
-
broadcast
final static <T extends Any> Broadcast<T> broadcast(SparkSession $self, T value)
Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.
- Parameters:
value
- value to broadcast to the Spark nodes
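For illustration, a hedged Kotlin sketch (the Multiplier class and the spark and ds values are hypothetical, not part of this API; Broadcast.value() is the standard Spark accessor):
data class Multiplier(val factor: Int)

val multiplier: Broadcast<Multiplier> = spark.broadcast(Multiplier(2))
// The broadcast value can be read inside distributed functions:
val scaled: Dataset<Int> = ds.map { it * multiplier.value().factor }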
-
broadcast
final static <T extends Any> Broadcast<T> broadcast(SparkContext $self, T value)
Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.
- Parameters:
value
- value to broadcast to the Spark nodes
-
toDS
final static <T extends Any> Dataset<T> toDS(SparkSession $self, List<T> list)
Utility method to create a dataset from a list.
-
toDS
final static <T extends Any> Dataset<T> toDS(List<T> $self, SparkSession spark)
Utility method to create a dataset from a list.
-
dsOf
final static <T extends Any> Dataset<T> dsOf(SparkSession $self, T t)
Utility method to create a dataset from the given elements.
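For illustration, a hedged sketch of the three creation paths (Person and spark are hypothetical; dsOf is assumed to accept a vararg of elements, which the flattened signature above renders as a single T):
data class Person(val name: String, val age: Int)

val ds1: Dataset<Person> = spark.toDS(listOf(Person("Alice", 30), Person("Bob", 25)))
val ds2: Dataset<Person> = listOf(Person("Alice", 30)).toDS(spark)
val ds3: Dataset<Person> = spark.dsOf(Person("Alice", 30), Person("Bob", 25))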
-
encoder
final static <T extends Any> Encoder<T> encoder()
Main method of the API, which gives you seamless integration with Spark: it creates an encoder for any given supported type T.
Supported types are data classes, primitives, and Lists, Maps, and Arrays containing them.
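For illustration, a hedged sketch (assumes the reified Kotlin form encoder<T>() and the hypothetical Person class from the sketch above):
val personEncoder: Encoder<Person> = encoder()        // T inferred as Person
val mapEncoder: Encoder<Map<String, Int>> = encoder() // maps of supported types also work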
-
generateEncoder
final static <T extends Any> Encoder<T> generateEncoder(KType type, KClass<?> cls)
-
map
final static <T extends Any, R extends Any> Dataset<R> map(Dataset<T> $self, Function1<T, R> func)
-
flatMap
final static <T extends Any, R extends Any> Dataset<R> flatMap(Dataset<T> $self, Function1<T, Iterator<R>> func)
-
groupByKey
final static <T extends Any, R extends Any> KeyValueGroupedDataset<R, T> groupByKey(Dataset<T> $self, Function1<T, R> func)
-
mapPartitions
final static <T extends Any, R extends Any> Dataset<R> mapPartitions(Dataset<T> $self, Function1<Iterator<T>, Iterator<R>> func)
-
filterNotNull
final static <T extends Any> Dataset<T> filterNotNull(Dataset<T> $self)
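For illustration, a hedged sketch (in the Kotlin source the receiver's element type is presumably nullable, which this flattened signature does not show):
val maybeNames: Dataset<String?> = ...
val names: Dataset<String> = maybeNames.filterNotNull()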
-
mapValues
final static <KEY extends Any, VALUE extends Any, R extends Any> KeyValueGroupedDataset<KEY, R> mapValues(KeyValueGroupedDataset<KEY, VALUE> $self, Function1<VALUE, R> func)
-
mapGroups
final static <KEY extends Any, VALUE extends Any, R extends Any> Dataset<R> mapGroups(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<KEY, Iterator<VALUE>, R> func)
-
reduceGroupsK
final static <KEY extends Any, VALUE extends Any> Dataset<Pair<KEY, VALUE>> reduceGroupsK(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<VALUE, VALUE, VALUE> func)
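For illustration, a hedged sketch of a grouped reduce combining groupByKey, reduceGroupsK, and takeValues (the Sale class and sales dataset are hypothetical):
data class Sale(val shop: String, val amount: Double)

val totals: Dataset<Sale> = sales
    .groupByKey { it.shop }
    .reduceGroupsK { a, b -> Sale(a.shop, a.amount + b.amount) }
    .takeValues()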
-
takeKeysTuple2
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysTuple2(Dataset<Tuple2<T1, T2>> $self)
-
takeKeys
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeys(Dataset<Pair<T1, T2>> $self)
-
takeKeysArity2
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysArity2(Dataset<Arity2<T1, T2>> $self)
-
takeValuesTuple2
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesTuple2(Dataset<Tuple2<T1, T2>> $self)
-
takeValues
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValues(Dataset<Pair<T1, T2>> $self)
-
takeValuesArity2
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesArity2(Dataset<Arity2<T1, T2>> $self)
-
flatMapGroups
final static <K extends Any, V extends Any, U extends Any> Dataset<U> flatMapGroups(KeyValueGroupedDataset<K, V> $self, Function2<K, Iterator<V>, Iterator<U>> func)
-
getValue
final static <S extends Any> S getValue(GroupState<S> $self, Object thisRef, KProperty<?> property)
-
setValue
final static <S extends Any> Unit setValue(GroupState<S> $self, Object thisRef, KProperty<?> property, S value)
-
mapGroupsWithState
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, Function3<K, Iterator<V>, GroupState<S>, U> func)
-
mapGroupsWithState
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, U> func)
-
flatMapGroupsWithState
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> flatMapGroupsWithState(KeyValueGroupedDataset<K, V> $self, OutputMode outputMode, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, Iterator<U>> func)
-
cogroup
final static <K extends Any, V extends Any, U extends Any, R extends Any> Dataset<R> cogroup(KeyValueGroupedDataset<K, V> $self, KeyValueGroupedDataset<K, U> other, Function3<K, Iterator<V>, Iterator<U>, Iterator<R>> func)
-
as
final static <T extends Any> TypedColumn<Object, T> as(Column $self)
Provides a type hint about the expected return value of this column. This information can be used by operations such as select on a Dataset to automatically convert the results into the correct JVM types.
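For illustration, a hedged sketch (since as is a Kotlin keyword, the call is assumed to need backticks):
val ageCol = df("age").`as`<Int>()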
-
forEachPartition
final static <T extends Any> Unit forEachPartition(Dataset<T> $self, Function1<Iterator<T>, Unit> func)
-
debugCodegen
final static <T extends Any> Dataset<T> debugCodegen(Dataset<T> $self)
It is hard to call Dataset.debugCodegen from Kotlin, so here is a utility for that.
-
debug
final static <T extends Any> Dataset<T> debug(Dataset<T> $self)
It is hard to call Dataset.debug from Kotlin, so here is a utility for that.
-
==
final static Column ==(Column $self, Column c)
-
unaryMinus
final static Column unaryMinus(Column $self)
Unary minus, i.e. negate the expression.
// Scala: select the amount column and negate all values.
df.select( -df("amount") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.select( -df("amount") )

// Java:
import static org.apache.spark.sql.functions.*;
df.select( negate(col("amount")) );
-
not
final static Column not(Column $self)
Inversion of boolean expression, i.e. NOT.
// Scala: select rows that are not active (isActive === false)
df.filter( !df("isActive") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.filter( !df("isActive") )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( not(df.col("isActive")) );
-
eq
final static Column eq(Column $self, Object other)
Equality test.
// Scala:
df.filter( df("colA") === df("colB") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.filter( df("colA") eq df("colB") )
// or
df.filter( df("colA") `===` df("colB") )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").equalTo(col("colB")) );
-
===
final static Column ===(Column $self, Object other)
Equality test.
// Scala:
df.filter( df("colA") === df("colB") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.filter( df("colA") eq df("colB") )
// or
df.filter( df("colA") `===` df("colB") )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").equalTo(col("colB")) );
-
neq
final static Column neq(Column $self, Object other)
Inequality test.
// Scala:
df.select( df("colA") =!= df("colB") )
df.select( !(df("colA") === df("colB")) )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.select( df("colA") neq df("colB") )
df.select( !(df("colA") eq df("colB")) )
// or
df.select( df("colA") `=!=` df("colB") )
df.select( !(df("colA") `===` df("colB")) )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").notEqual(col("colB")) );
-
=!=
final static Column =!=(Column $self, Object other)
Inequality test.
// Scala:
df.select( df("colA") =!= df("colB") )
df.select( !(df("colA") === df("colB")) )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.select( df("colA") neq df("colB") )
df.select( !(df("colA") eq df("colB")) )
// or
df.select( df("colA") `=!=` df("colB") )
df.select( !(df("colA") `===` df("colB")) )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").notEqual(col("colB")) );
-
gt
final static Column gt(Column $self, Object other)
Greater than.
// Scala: The following selects people older than 21.
people.select( people("age") > 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") gt 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").gt(21) );
-
lt
final static Column lt(Column $self, Object other)
Less than.
// Scala: The following selects people younger than 21.
people.select( people("age") < 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") lt 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").lt(21) );
-
leq
final static Column leq(Column $self, Object other)
Less than or equal to.
// Scala: The following selects people age 21 or younger.
people.select( people("age") <= 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") leq 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").leq(21) );
-
geq
final static Column geq(Column $self, Object other)
Greater than or equal to an expression.
// Scala: The following selects people age 21 or older.
people.select( people("age") >= 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") geq 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").geq(21) );
-
inRangeOf
final static Column inRangeOf(Column $self, ClosedRange<?> range)
True if the current column is in the given range.
// Scala:
df.where( df("colA").between(1, 5) )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.where( df("colA") inRangeOf 1..5 )

// Java:
import static org.apache.spark.sql.functions.*;
df.where( df.col("colA").between(1, 5) );
-
or
final static Column or(Column $self, Object other)
Boolean OR.
// Scala: The following selects people that are in school or employed.
people.filter( people("inSchool") || people("isEmployed") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.filter( people("inSchool") or people("isEmployed") )

// Java:
import static org.apache.spark.sql.functions.*;
people.filter( people.col("inSchool").or(people.col("isEmployed")) );
-
and
final static Column and(Column $self, Object other)
Boolean AND.
// Scala: The following selects people that are in school and employed at the same time.
people.select( people("inSchool") && people("isEmployed") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.filter( people("inSchool") and people("isEmployed") )
// or
people.filter( people("inSchool") `&&` people("isEmployed") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("inSchool").and(people.col("isEmployed")) );
-
&&
final static Column &&(Column $self, Object other)
Boolean AND.
// Scala: The following selects people that are in school and employed at the same time.
people.select( people("inSchool") && people("isEmployed") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.filter( people("inSchool") and people("isEmployed") )
// or
people.filter( people("inSchool") `&&` people("isEmployed") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("inSchool").and(people.col("isEmployed")) );
-
times
final static Column times(Column $self, Object other)
Multiplication of this expression and another expression.
// Scala: The following multiplies a person's height by their weight.
people.select( people("height") * people("weight") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("height") * people("weight") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("height").multiply(people.col("weight")) );
-
div
final static Column div(Column $self, Object other)
Division of this expression by another expression.
// Scala: The following divides a person's height by their weight.
people.select( people("height") / people("weight") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("height") / people("weight") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("height").divide(people.col("weight")) );
-
rem
final static Column rem(Column $self, Object other)
Modulo (a.k.a. remainder) expression.
// Scala:
df.where( df("colA") % 2 === 0 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.where( df("colA") % 2 eq 0 )

// Java:
import static org.apache.spark.sql.functions.*;
df.where( df.col("colA").mod(2).equalTo(0) );
-
get
final static Column get(Column $self, Object key)
An expression that gets an item at position ordinal out of an array, or gets a value by key key in a MapType.
// Scala:
df.where( df("arrayColumn").getItem(0) === 5 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.where( df("arrayColumn")[0] eq 5 )

// Java:
import static org.apache.spark.sql.functions.*;
df.where( df.col("arrayColumn").getItem(0).equalTo(5) );
-
leftJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> leftJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "left" join type and reflects the fact that, in the result of a left join, the right relation is nullable.
- Parameters:
right
- right dataset
col
- join condition
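For illustration, a hedged sketch (Customer, Order, and the column names are hypothetical; in the Kotlin source the right side of each pair is null for unmatched rows, which the flattened signature does not show):
val joined = customers.leftJoin(orders, customers("id") eq orders("customerId"))
joined.map { (customer, order) -> customer.name to (order?.total ?: 0.0) }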
-
rightJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> rightJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "right" join type and reflects the fact that, in the result of a right join, the left relation is nullable.
- Parameters:
right
- right dataset
col
- join condition
-
innerJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> innerJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "inner" join type.
- Parameters:
right
- right dataset
col
- join condition
-
fullJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> fullJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith that passes the "full" join type and reflects the fact that, in the result of a full join, any element of the resulting tuple is nullable.
- Parameters:
right
- right dataset
col
- join condition
-
sort
final static <T extends Any> Dataset<T> sort(Dataset<T> $self, Function1<Dataset<T>, Array<Column>> columns)
Alias for Dataset.sort that forces the user to provide sort columns from the source dataset.
- Parameters:
columns
- producer of sort columns
-
sort
final static <T extends Any> Dataset<T> sort(Dataset<T> $self, KProperty1<T, ?> col, KProperty1<T, ?> cols)
Allows sorting a data class dataset on one or more properties of the data class.
val sorted: Dataset<YourClass> = unsorted.sort(YourClass::a)
val sorted2: Dataset<YourClass> = unsorted.sort(YourClass::a, YourClass::b)
-
withCached
final static <T extends Any, R extends Any> R withCached(Dataset<T> $self, Boolean blockingUnpersist, Function1<Dataset<T>, R> executeOnCached)
This function creates a block in which one can run further computations on an already cached dataset. The data will be unpersisted automatically at the end of the computation.
This may be useful in many situations, for example, when one needs to write data to several targets:
ds.withCached {
    write()
        .also { it.orc("First destination") }
        .also { it.avro("Second destination") }
}
- Parameters:
blockingUnpersist
- if execution should be blocked until everything persisted is deleted
executeOnCached
- block which should be executed on the cached dataset
-
invoke
final static <T extends Any> Column invoke(Dataset<T> $self, String colName)
Selects a column based on the column name and returns it as a Column.
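For illustration (the ds dataset and the column name are hypothetical):
val nameCol: Column = ds("name")  // shorthand for ds.col("name")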
-
invoke
final static <T extends Any, U extends Any> TypedColumn<T, U> invoke(Dataset<T> $self, KProperty1<T, U> column)
Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
val dataset: Dataset<YourClass> = ...
val columnA: TypedColumn<YourClass, TypeOfA> = dataset(YourClass::a)
-
col
final static <T extends Any, U extends Any> TypedColumn<T, U> col(Dataset<T> $self, KProperty1<T, U> column)
Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
val dataset: Dataset<YourClass> = ...
val columnA: TypedColumn<YourClass, TypeOfA> = dataset.col(YourClass::a)
-
col
final static <T extends Any, U extends Any> TypedColumn<T, U> col(KProperty1<T, U> column)
Returns a Column based on the given class attribute, not connected to a dataset.
val dataset: Dataset<YourClass> = ...
val new: Dataset<Tuple2<TypeOfA, TypeOfB>> = dataset.select( col(YourClass::a), col(YourClass::b) )
TODO: change example to Pairs when merged
-
showDS
final static <T extends Any> Dataset<T> showDS(Dataset<T> $self, Integer numRows, Boolean truncate)
Alternative to Dataset.show that returns the source dataset. Useful for debugging when you need to view the content of a dataset as an intermediate operation.
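For illustration, a hedged sketch of peeking at intermediate content without breaking the call chain (the Person dataset people is hypothetical):
val mapped: Dataset<Person> = people
    .showDS(20, false)
    .map { it.copy(age = it.age + 1) }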
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any> Dataset<Pair<U1, U2>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2)
Returns a new Dataset by computing the given Column expressions for each element.
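For illustration, a hedged sketch combining selectTyped with the refactor-safe col helper from this API (the Person dataset people is hypothetical):
val pairs: Dataset<Pair<String, Int>> = people.selectTyped(
    col(Person::name),
    col(Person::age)
)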
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any> Dataset<Triple<U1, U2, U3>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3)
Returns a new Dataset by computing the given Column expressions for each element.
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any> Dataset<Arity4<U1, U2, U3, U4>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4)
Returns a new Dataset by computing the given Column expressions for each element.
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any, U5 extends Any> Dataset<Arity5<U1, U2, U3, U4, U5>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4, TypedColumn<T, U5> c5)
Returns a new Dataset by computing the given Column expressions for each element.
-
setLogLevel
final static Unit setLogLevel(SparkContext $self, SparkLogLevel level)
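For illustration (sc is a hypothetical SparkContext; SparkLogLevel is assumed to provide the usual levels such as ERROR):
sc.setLogLevel(SparkLogLevel.ERROR)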