-
public final class ApiV1Kt
-
Field Summary
Fields
Modifier and Type                                  Field
public final static Map<KClass<?>, Encoder<?>>     ENCODERS
private final static SparkContext                  sparkContext
private final static TimestampType$                timestampDt
private final static DateType$                     dateDt
-
Method Summary
final SparkContext getSparkContext()
final TimestampType$ getTimestampDt()
final DateType$ getDateDt()
final static <T extends Any> Broadcast<T> broadcast(SparkSession $self, T value)
    Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions.
final static <T extends Any> Broadcast<T> broadcast(SparkContext $self, T value)
    Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions.
final static <T extends Any> Dataset<T> toDS(SparkSession $self, List<T> list)
    Utility method to create a dataset from a list.
final static <T extends Any> Dataset<T> toDS(List<T> $self, SparkSession spark)
    Utility method to create a dataset from a list.
final static <T extends Any> Dataset<T> dsOf(SparkSession $self, T t)
    Utility method to create a dataset from the given elements.
final static <T extends Any> Encoder<T> encoder()
    Main method of the API, which gives you seamless integration with Spark: it creates an encoder for any given supported type T. Supported types are data classes, primitives, and Lists, Maps, and Arrays containing them.
final static <T extends Any> Encoder<T> generateEncoder(KType type, KClass<?> cls)
final static <T extends Any, R extends Any> Dataset<R> map(Dataset<T> $self, Function1<T, R> func)
final static <T extends Any, R extends Any> Dataset<R> flatMap(Dataset<T> $self, Function1<T, Iterator<R>> func)
final static <T extends Any, I extends Iterable<T>> Dataset<T> flatten(Dataset<I> $self)
final static <T extends Any, R extends Any> KeyValueGroupedDataset<R, T> groupByKey(Dataset<T> $self, Function1<T, R> func)
final static <T extends Any, R extends Any> Dataset<R> mapPartitions(Dataset<T> $self, Function1<Iterator<T>, Iterator<R>> func)
final static <T extends Any> Dataset<T> filterNotNull(Dataset<T> $self)
final static <KEY extends Any, VALUE extends Any, R extends Any> KeyValueGroupedDataset<KEY, R> mapValues(KeyValueGroupedDataset<KEY, VALUE> $self, Function1<VALUE, R> func)
final static <KEY extends Any, VALUE extends Any, R extends Any> Dataset<R> mapGroups(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<KEY, Iterator<VALUE>, R> func)
final static <KEY extends Any, VALUE extends Any> Dataset<Pair<KEY, VALUE>> reduceGroupsK(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<VALUE, VALUE, VALUE> func)
final static <T extends Any> T reduceK(Dataset<T> $self, Function2<T, T, T> func)
    (Kotlin-specific) Reduces the elements of this Dataset using the specified binary function.
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysTuple2(Dataset<Tuple2<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeys(Dataset<Pair<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysArity2(Dataset<Arity2<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesTuple2(Dataset<Tuple2<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValues(Dataset<Pair<T1, T2>> $self)
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesArity2(Dataset<Arity2<T1, T2>> $self)
final static <K extends Any, V extends Any, U extends Any> Dataset<U> flatMapGroups(KeyValueGroupedDataset<K, V> $self, Function2<K, Iterator<V>, Iterator<U>> func)
final static <S extends Any> S getOrNull(GroupState<S> $self)
final static <S extends Any> S getValue(GroupState<S> $self, Object thisRef, KProperty<?> property)
final static <S extends Any> Unit setValue(GroupState<S> $self, Object thisRef, KProperty<?> property, S value)
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, Function3<K, Iterator<V>, GroupState<S>, U> func)
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, U> func)
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> flatMapGroupsWithState(KeyValueGroupedDataset<K, V> $self, OutputMode outputMode, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, Iterator<U>> func)
final static <K extends Any, V extends Any, U extends Any, R extends Any> Dataset<R> cogroup(KeyValueGroupedDataset<K, V> $self, KeyValueGroupedDataset<K, U> other, Function3<K, Iterator<V>, Iterator<U>, Iterator<R>> func)
final static <T extends Any, R extends Any> Dataset<R> downcast(Dataset<T> $self)
final static <R extends Any> Dataset<R> as(Dataset<?> $self)
final static <T extends Any> TypedColumn<Object, T> as(Column $self)
    Provides a type hint about the expected return value of this column.
final static <R extends Any> Dataset<R> to(Dataset<?> $self)
final static <T extends Any> Unit forEach(Dataset<T> $self, Function1<T, Unit> func)
final static <T extends Any> Unit forEachPartition(Dataset<T> $self, Function1<Iterator<T>, Unit> func)
final static <T extends Any> Dataset<T> debugCodegen(Dataset<T> $self)
    It is hard to call Dataset.debugCodegen from Kotlin, so here is a utility for that.
final static <T extends Any> Dataset<T> debug(Dataset<T> $self)
    It is hard to call Dataset.debug from Kotlin, so here is a utility for that.
final static Column ==(Column $self, Column c)
final static Column unaryMinus(Column $self)
    Unary minus, i.e. negate the expression.
final static Column not(Column $self)
    Inversion of boolean expression, i.e. NOT.
final static Column eq(Column $self, Object other)
    Equality test.
final static Column ===(Column $self, Object other)
    Equality test.
final static Column neq(Column $self, Object other)
    Inequality test.
final static Column =!=(Column $self, Object other)
    Inequality test.
final static Column gt(Column $self, Object other)
    Greater than.
final static Column lt(Column $self, Object other)
    Less than.
final static Column leq(Column $self, Object other)
    Less than or equal to.
final static Column geq(Column $self, Object other)
    Greater than or equal to an expression.
final static Column inRangeOf(Column $self, ClosedRange<?> range)
    True if the current column is in the given range.
final static Column or(Column $self, Object other)
    Boolean OR.
final static Column and(Column $self, Object other)
    Boolean AND.
final static Column &&(Column $self, Object other)
    Boolean AND.
final static Column times(Column $self, Object other)
    Multiplication of this expression and another expression.
final static Column div(Column $self, Object other)
    Division of this expression by another expression.
final static Column rem(Column $self, Object other)
    Modulo (a.k.a. remainder) expression.
final static Column get(Column $self, Object key)
    An expression that gets an item at position ordinal out of an array, or gets a value for the given key in a MapType.
final static Column lit(Object a)
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> leftJoin(Dataset<L> $self, Dataset<R> right, Column col)
    Alias for Dataset.joinWith which passes the "left" argument and respects the fact that, in the result of a left join, the right relation is nullable.
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> rightJoin(Dataset<L> $self, Dataset<R> right, Column col)
    Alias for Dataset.joinWith which passes the "right" argument and respects the fact that, in the result of a right join, the left relation is nullable.
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> innerJoin(Dataset<L> $self, Dataset<R> right, Column col)
    Alias for Dataset.joinWith which passes the "inner" argument.
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> fullJoin(Dataset<L> $self, Dataset<R> right, Column col)
    Alias for Dataset.joinWith which passes the "full" argument and respects the fact that, in the result of a full join, any element of the resulting tuple is nullable.
final static <T extends Any> Dataset<T> sort(Dataset<T> $self, Function1<Dataset<T>, Array<Column>> columns)
    Alias for Dataset.sort which forces the user to provide sort columns from the source dataset.
final static <T extends Any> Dataset<T> sort(Dataset<T> $self, KProperty1<T, ?> col, KProperty1<T, ?> cols)
    Allows sorting a data class dataset on one or more of the properties of the data class.
final static <T extends Any, R extends Any> R withCached(Dataset<T> $self, Boolean blockingUnpersist, Function1<Dataset<T>, R> executeOnCached)
    Creates a block where one can run further computations on an already cached dataset; data is unpersisted automatically at the end of the computation.
final static <T extends Any> List<T> toList(Dataset<Row> $self)
final static <R extends Any> Array<R> toArray(Dataset<?> $self)
final static <T extends Any> Column invoke(Dataset<T> $self, String colName)
    Selects column based on the column name and returns it as a Column.
final static <T extends Any, U extends Any> TypedColumn<T, U> invoke(Dataset<T> $self, KProperty1<T, U> column)
    Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
final static <T extends Any, U extends Any> TypedColumn<T, U> col(Dataset<T> $self, KProperty1<T, U> column)
    Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
final static <T extends Any, U extends Any> TypedColumn<T, U> col(KProperty1<T, U> column)
    Returns a Column based on the given class attribute, not connected to a dataset.
final static <T extends Any> Dataset<T> showDS(Dataset<T> $self, Integer numRows, Boolean truncate)
    Alternative to Dataset.show which returns the source dataset.
final static <T extends Any, U1 extends Any, U2 extends Any> Dataset<Pair<U1, U2>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2)
    Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any> Dataset<Triple<U1, U2, U3>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3)
    Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any> Dataset<Arity4<U1, U2, U3, U4>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4)
    Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any, U5 extends Any> Dataset<Arity5<U1, U2, U3, U4, U5>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4, TypedColumn<T, U5> c5)
    Returns a new Dataset by computing the given Column expressions for each element.
final static <T extends Any> DataType schema(Map<String, KType> map)
final static DataType schema(KType type, Map<String, KType> map)
final static Unit setLogLevel(SparkContext $self, SparkLogLevel level)
-
Method Detail
-
getSparkContext
final SparkContext getSparkContext()
-
getTimestampDt
final TimestampType$ getTimestampDt()
-
getDateDt
final DateType$ getDateDt()
-
broadcast
final static <T extends Any> Broadcast<T> broadcast(SparkSession $self, T value)
Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.
- Parameters:
value - value to broadcast to the Spark nodes
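For instance, a minimal usage sketch (the local-mode session and the lookup values are illustrative assumptions, not part of this API):

import org.apache.spark.sql.SparkSession
import org.jetbrains.kotlinx.spark.api.*

fun main() {
    val spark = SparkSession.builder().master("local[*]").appName("broadcast-demo").getOrCreate()
    // The lookup map is shipped to each executor once instead of with every task.
    val lookup = spark.broadcast(mapOf(1 to "one", 2 to "two"))
    spark.dsOf(1, 2, 2)
        .map { lookup.value[it] ?: "unknown" }
        .show()
    spark.stop()
}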
-
broadcast
final static <T extends Any> Broadcast<T> broadcast(SparkContext $self, T value)
Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.
- Parameters:
value - value to broadcast to the Spark nodes
-
toDS
final static <T extends Any> Dataset<T> toDS(SparkSession $self, List<T> list)
Utility method to create a dataset from a list.
-
toDS
final static <T extends Any> Dataset<T> toDS(List<T> $self, SparkSession spark)
Utility method to create a dataset from a list.
-
dsOf
final static <T extends Any> Dataset<T> dsOf(SparkSession $self, T t)
Utility method to create a dataset from the given elements.
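A short sketch of all three creation styles (Person is a made-up data class, and a SparkSession named spark is assumed to be in scope):

import org.apache.spark.sql.Dataset
import org.jetbrains.kotlinx.spark.api.*

data class Person(val name: String, val age: Int)  // hypothetical type

// From a SparkSession and a list:
val ds1: Dataset<Person> = spark.toDS(listOf(Person("Ann", 30), Person("Bob", 25)))

// As an extension on the list itself:
val ds2: Dataset<Person> = listOf(Person("Ann", 30), Person("Bob", 25)).toDS(spark)

// From individual elements:
val ds3: Dataset<Person> = spark.dsOf(Person("Ann", 30), Person("Bob", 25))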
-
encoder
final static <T extends Any> Encoder<T> encoder()
Main method of the API, which gives you seamless integration with Spark: it creates an encoder for any given supported type T.
Supported types are data classes, primitives, and Lists, Maps, and Arrays containing them.
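A brief sketch (the data class is invented; encoder() derives the Encoder from the reified type argument, and a SparkSession named spark is assumed):

import org.apache.spark.sql.Encoder
import org.jetbrains.kotlinx.spark.api.*

data class Person(val name: String, val age: Int)  // hypothetical data class

// Materialize encoders for supported types explicitly:
val personEncoder: Encoder<Person> = encoder()
val listEncoder: Encoder<List<Int>> = encoder()

// Or pass one where Spark expects an Encoder:
val ds = spark.createDataset(listOf(Person("Ann", 30)), encoder())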
-
generateEncoder
final static <T extends Any> Encoder<T> generateEncoder(KType type, KClass<?> cls)
-
map
final static <T extends Any, R extends Any> Dataset<R> map(Dataset<T> $self, Function1<T, R> func)
-
flatMap
final static <T extends Any, R extends Any> Dataset<R> flatMap(Dataset<T> $self, Function1<T, Iterator<R>> func)
-
groupByKey
final static <T extends Any, R extends Any> KeyValueGroupedDataset<R, T> groupByKey(Dataset<T> $self, Function1<T, R> func)
-
mapPartitions
final static <T extends Any, R extends Any> Dataset<R> mapPartitions(Dataset<T> $self, Function1<Iterator<T>, Iterator<R>> func)
-
filterNotNull
final static <T extends Any> Dataset<T> filterNotNull(Dataset<T> $self)
-
mapValues
final static <KEY extends Any, VALUE extends Any, R extends Any> KeyValueGroupedDataset<KEY, R> mapValues(KeyValueGroupedDataset<KEY, VALUE> $self, Function1<VALUE, R> func)
-
mapGroups
final static <KEY extends Any, VALUE extends Any, R extends Any> Dataset<R> mapGroups(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<KEY, Iterator<VALUE>, R> func)
-
reduceGroupsK
final static <KEY extends Any, VALUE extends Any> Dataset<Pair<KEY, VALUE>> reduceGroupsK(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<VALUE, VALUE, VALUE> func)
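Combined with groupByKey above, a sketch of the grouped-reduction flow (sample data invented, SparkSession spark assumed):

import org.apache.spark.sql.Dataset
import org.jetbrains.kotlinx.spark.api.*

// Group words by their first letter, then keep the longest word per group.
val longestPerLetter: Dataset<Pair<String, String>> = spark
    .dsOf("apple", "avocado", "banana", "blueberry")
    .groupByKey { it.substring(0, 1) }
    .reduceGroupsK { a, b -> if (a.length >= b.length) a else b }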
-
reduceK
final static <T extends Any> T reduceK(Dataset<T> $self, Function2<T, T, T> func)
(Kotlin-specific) Reduces the elements of this Dataset using the specified binary function. The given func must be commutative and associative or the result may be non-deterministic.
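For example (a sketch, assuming a SparkSession spark in scope):

// Sum a dataset of integers; addition is commutative and associative.
val total: Int = spark.dsOf(1, 2, 3, 4).reduceK { a, b -> a + b }  // 10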
-
takeKeysTuple2
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysTuple2(Dataset<Tuple2<T1, T2>> $self)
-
takeKeys
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeys(Dataset<Pair<T1, T2>> $self)
-
takeKeysArity2
final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysArity2(Dataset<Arity2<T1, T2>> $self)
-
takeValuesTuple2
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesTuple2(Dataset<Tuple2<T1, T2>> $self)
-
takeValues
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValues(Dataset<Pair<T1, T2>> $self)
-
takeValuesArity2
final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesArity2(Dataset<Arity2<T1, T2>> $self)
-
flatMapGroups
final static <K extends Any, V extends Any, U extends Any> Dataset<U> flatMapGroups(KeyValueGroupedDataset<K, V> $self, Function2<K, Iterator<V>, Iterator<U>> func)
-
getValue
final static <S extends Any> S getValue(GroupState<S> $self, Object thisRef, KProperty<?> property)
-
setValue
final static <S extends Any> Unit setValue(GroupState<S> $self, Object thisRef, KProperty<?> property, S value)
-
mapGroupsWithState
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, Function3<K, Iterator<V>, GroupState<S>, U> func)
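A hedged sketch of keyed state (Event and the events dataset are invented; update and the surrounding streaming setup come from Spark's GroupState API):

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.GroupState
import org.jetbrains.kotlinx.spark.api.*

data class Event(val key: String)  // hypothetical input type

// Count events per key, carrying the count across invocations in GroupState.
val counts: Dataset<Pair<String, Int>> = events
    .groupByKey { it.key }
    .mapGroupsWithState { key: String, values: Iterator<Event>, state: GroupState<Int> ->
        val newCount = (state.getOrNull() ?: 0) + values.asSequence().count()
        state.update(newCount)
        key to newCount
    }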
-
mapGroupsWithState
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, U> func)
-
flatMapGroupsWithState
final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> flatMapGroupsWithState(KeyValueGroupedDataset<K, V> $self, OutputMode outputMode, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, Iterator<U>> func)
-
cogroup
final static <K extends Any, V extends Any, U extends Any, R extends Any> Dataset<R> cogroup(KeyValueGroupedDataset<K, V> $self, KeyValueGroupedDataset<K, U> other, Function3<K, Iterator<V>, Iterator<U>, Iterator<R>> func)
-
as
final static <T extends Any> TypedColumn<Object, T> as(Column $self)
Provides a type hint about the expected return value of this column. This information can be used by operations such as select on a Dataset to automatically convert the results into the correct JVM types.
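A small sketch (the DataFrame df and the column name are assumptions):

import org.apache.spark.sql.TypedColumn
import org.jetbrains.kotlinx.spark.api.*

// Hint that the "amount" column should be materialized as Int.
val amount: TypedColumn<Any, Int> = df("amount").`as`<Int>()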
-
forEachPartition
final static <T extends Any> Unit forEachPartition(Dataset<T> $self, Function1<Iterator<T>, Unit> func)
-
debugCodegen
final static <T extends Any> Dataset<T> debugCodegen(Dataset<T> $self)
It is hard to call Dataset.debugCodegen from Kotlin, so here is a utility for that.
-
debug
final static <T extends Any> Dataset<T> debug(Dataset<T> $self)
It is hard to call Dataset.debug from Kotlin, so here is a utility for that.
-
==
final static Column ==(Column $self, Column c)
-
unaryMinus
final static Column unaryMinus(Column $self)
Unary minus, i.e. negate the expression.
// Scala: select the amount column and negate all values.
df.select( -df("amount") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.select( -df("amount") )

// Java:
import static org.apache.spark.sql.functions.*;
df.select( negate(col("amount")) );
-
not
final static Column not(Column $self)
Inversion of boolean expression, i.e. NOT.
// Scala: select rows that are not active (isActive === false)
df.filter( !df("isActive") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.filter( !df("isActive") )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( not(df.col("isActive")) );
-
eq
final static Column eq(Column $self, Object other)
Equality test.
// Scala:
df.filter( df("colA") === df("colB") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.filter( df("colA") eq df("colB") )
// or
df.filter( df("colA") `===` df("colB") )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").equalTo(col("colB")) );
-
===
final static Column ===(Column $self, Object other)
Equality test.
// Scala:
df.filter( df("colA") === df("colB") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.filter( df("colA") eq df("colB") )
// or
df.filter( df("colA") `===` df("colB") )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").equalTo(col("colB")) );
-
neq
final static Column neq(Column $self, Object other)
Inequality test.
// Scala:
df.select( df("colA") =!= df("colB") )
df.select( !(df("colA") === df("colB")) )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.select( df("colA") neq df("colB") )
df.select( !(df("colA") eq df("colB")) )
// or
df.select( df("colA") `=!=` df("colB") )
df.select( !(df("colA") `===` df("colB")) )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").notEqual(col("colB")) );
-
=!=
final static Column =!=(Column $self, Object other)
Inequality test.
// Scala:
df.select( df("colA") =!= df("colB") )
df.select( !(df("colA") === df("colB")) )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.select( df("colA") neq df("colB") )
df.select( !(df("colA") eq df("colB")) )
// or
df.select( df("colA") `=!=` df("colB") )
df.select( !(df("colA") `===` df("colB")) )

// Java:
import static org.apache.spark.sql.functions.*;
df.filter( col("colA").notEqual(col("colB")) );
-
gt
final static Column gt(Column $self, Object other)
Greater than.
// Scala: The following selects people older than 21.
people.select( people("age") > 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") gt 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").gt(21) );
-
lt
final static Column lt(Column $self, Object other)
Less than.
// Scala: The following selects people younger than 21.
people.select( people("age") < 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") lt 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").lt(21) );
-
leq
final static Column leq(Column $self, Object other)
Less than or equal to.
// Scala: The following selects people age 21 or younger.
people.select( people("age") <= 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") leq 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").leq(21) );
-
geq
final static Column geq(Column $self, Object other)
Greater than or equal to an expression.
// Scala: The following selects people age 21 or older.
people.select( people("age") >= 21 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("age") geq 21 )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("age").geq(21) );
-
inRangeOf
final static Column inRangeOf(Column $self, ClosedRange<?> range)
True if the current column is in the given range.
// Scala:
df.where( df("colA").between(1, 5) )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.where( df("colA") inRangeOf 1..5 )

// Java:
import static org.apache.spark.sql.functions.*;
df.where( df.col("colA").between(1, 5) );
-
or
final static Column or(Column $self, Object other)
Boolean OR.
// Scala: The following selects people that are in school or employed.
people.filter( people("inSchool") || people("isEmployed") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.filter( people("inSchool") or people("isEmployed") )

// Java:
import static org.apache.spark.sql.functions.*;
people.filter( people.col("inSchool").or(people.col("isEmployed")) );
-
and
final static Column and(Column $self, Object other)
Boolean AND.
// Scala: The following selects people that are in school and employed at the same time.
people.select( people("inSchool") && people("isEmployed") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.filter( people("inSchool") and people("isEmployed") )
// or
people.filter( people("inSchool") `&&` people("isEmployed") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("inSchool").and(people.col("isEmployed")) );
-
&&
final static Column &&(Column $self, Object other)
Boolean AND.
// Scala: The following selects people that are in school and employed at the same time.
people.select( people("inSchool") && people("isEmployed") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.filter( people("inSchool") and people("isEmployed") )
// or
people.filter( people("inSchool") `&&` people("isEmployed") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("inSchool").and(people.col("isEmployed")) );
-
times
final static Column times(Column $self, Object other)
Multiplication of this expression and another expression.
// Scala: The following multiplies a person's height by their weight.
people.select( people("height") * people("weight") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("height") * people("weight") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("height").multiply(people.col("weight")) );
-
div
final static Column div(Column $self, Object other)
Division of this expression by another expression.

// Scala: The following divides a person's height by their weight.
people.select( people("height") / people("weight") )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
people.select( people("height") / people("weight") )

// Java:
import static org.apache.spark.sql.functions.*;
people.select( people.col("height").divide(people.col("weight")) );
-
rem
final static Column rem(Column $self, Object other)
Modulo (a.k.a. remainder) expression.
// Scala:
df.where( df("colA") % 2 === 0 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.where( df("colA") % 2 eq 0 )

// Java:
import static org.apache.spark.sql.functions.*;
df.where( df.col("colA").mod(2).equalTo(0) );
-
get
final static Column get(Column $self, Object key)
An expression that gets an item at position ordinal out of an array, or gets a value for the given key in a MapType.

// Scala:
df.where( df("arrayColumn").getItem(0) === 5 )

// Kotlin:
import org.jetbrains.kotlinx.spark.api.*
df.where( df("arrayColumn")[0] eq 5 )

// Java:
import static org.apache.spark.sql.functions.*;
df.where( df.col("arrayColumn").getItem(0).equalTo(5) );
-
leftJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> leftJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith which passes the "left" argument and respects the fact that, in the result of a left join, the right relation is nullable.
- Parameters:
right - right dataset
col - join condition
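A hedged sketch (the data classes, datasets, and join columns are invented):

import org.apache.spark.sql.Dataset
import org.jetbrains.kotlinx.spark.api.*

data class Customer(val id: Int, val name: String)        // hypothetical
data class Order(val customerId: Int, val total: Double)  // hypothetical

// For customers without orders the right side of the Pair is null.
val joined: Dataset<Pair<Customer, Order?>> = customers
    .leftJoin(orders, customers.col("id") eq orders.col("customerId"))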
-
rightJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> rightJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith which passes the "right" argument and respects the fact that, in the result of a right join, the left relation is nullable.
- Parameters:
right - right dataset
col - join condition
-
innerJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> innerJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith which passes the "inner" argument.
- Parameters:
right - right dataset
col - join condition
-
fullJoin
final static <L extends Any, R extends Any> Dataset<Pair<L, R>> fullJoin(Dataset<L> $self, Dataset<R> right, Column col)
Alias for Dataset.joinWith which passes the "full" argument and respects the fact that, in the result of a full join, any element of the resulting tuple is nullable.
- Parameters:
right - right dataset
col - join condition
-
sort
final static <T extends Any> Dataset<T> sort(Dataset<T> $self, Function1<Dataset<T>, Array<Column>> columns)
Alias for Dataset.sort which forces the user to provide sort columns from the source dataset.
- Parameters:
columns - producer of sort columns
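For example, a sketch that pulls the sort columns from the dataset itself via the invoke extension below (the dataset and column names are assumptions):

// ds is an assumed Dataset with "age" and "name" columns.
val sorted = ds.sort { arrayOf(it("age"), it("name")) }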
-
sort
final static <T extends Any> Dataset<T> sort(Dataset<T> $self, KProperty1<T, ?> col, KProperty1<T, ?> cols)
Allows sorting a data class dataset on one or more of the properties of the data class.

val sorted: Dataset<YourClass> = unsorted.sort(YourClass::a)
val sorted2: Dataset<YourClass> = unsorted.sort(YourClass::a, YourClass::b)
-
withCached
final static <T extends Any, R extends Any> R withCached(Dataset<T> $self, Boolean blockingUnpersist, Function1<Dataset<T>, R> executeOnCached)
This function creates a block where one can run further computations on an already cached dataset. Data will be unpersisted automatically at the end of the computation.
It may be useful in many situations, for example when one needs to write data to several targets:

ds.withCached {
    write()
        .also { it.orc("First destination") }
        .also { it.avro("Second destination") }
}

- Parameters:
blockingUnpersist - if execution should be blocked until everything persisted is deleted
executeOnCached - block which should be executed on the cached dataset
-
invoke
final static <T extends Any> Column invoke(Dataset<T> $self, String colName)
Selects column based on the column name and returns it as a Column.
-
invoke
final static <T extends Any, U extends Any> TypedColumn<T, U> invoke(Dataset<T> $self, KProperty1<T, U> column)
Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
val dataset: Dataset<YourClass> = ...
val columnA: TypedColumn<YourClass, TypeOfA> = dataset(YourClass::a)
-
col
final static <T extends Any, U extends Any> TypedColumn<T, U> col(Dataset<T> $self, KProperty1<T, U> column)
Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
val dataset: Dataset<YourClass> = ...
val columnA: TypedColumn<YourClass, TypeOfA> = dataset.col(YourClass::a)
-
col
final static <T extends Any, U extends Any> TypedColumn<T, U> col(KProperty1<T, U> column)
Returns a Column based on the given class attribute, not connected to a dataset.
val dataset: Dataset<YourClass> = ...
val new: Dataset<Tuple2<TypeOfA, TypeOfB>> = dataset.select( col(YourClass::a), col(YourClass::b) )

TODO: change example to Pairs when merged
-
showDS
final static <T extends Any> Dataset<T> showDS(Dataset<T> $self, Integer numRows, Boolean truncate)
Alternative to Dataset.show which returns the source dataset. Useful for debugging when you need to view the content of a dataset as an intermediate operation.
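For instance (a sketch, assuming a SparkSession spark in scope):

val doubled = spark.dsOf(1, 2, 3)
    .showDS(numRows = 3, truncate = false)  // prints the content, returns the dataset unchanged
    .map { it * 2 }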
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any> Dataset<Pair<U1, U2>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2)
Returns a new Dataset by computing the given Column expressions for each element.
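A sketch combining selectTyped with the refactor-safe col helper described above (Person and people are invented):

import org.apache.spark.sql.Dataset
import org.jetbrains.kotlinx.spark.api.*

data class Person(val name: String, val age: Int)  // hypothetical

// people: Dataset<Person>
val pairs: Dataset<Pair<String, Int>> = people.selectTyped(
    people.col(Person::name),
    people.col(Person::age)
)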
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any> Dataset<Triple<U1, U2, U3>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3)
Returns a new Dataset by computing the given Column expressions for each element.
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any> Dataset<Arity4<U1, U2, U3, U4>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4)
Returns a new Dataset by computing the given Column expressions for each element.
-
selectTyped
final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any, U5 extends Any> Dataset<Arity5<U1, U2, U3, U4, U5>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4, TypedColumn<T, U5> c5)
Returns a new Dataset by computing the given Column expressions for each element.
-
setLogLevel
final static Unit setLogLevel(SparkContext $self, SparkLogLevel level)
-