Package org.jetbrains.kotlinx.spark.api

Class ApiV1Kt

    • Field Summary

      Fields 
      Modifier and Type Field Description
      public final static Map<KClass<?>, Encoder<?>> ENCODERS
      private final static SparkContext sparkContext
      private final static TimestampType$ timestampDt
      private final static DateType$ dateDt
    • Method Summary

      Modifier and Type Method Description
      final SparkContext getSparkContext()
      final TimestampType$ getTimestampDt()
      final DateType$ getDateDt()
      final static <T extends Any> Broadcast<T> broadcast(SparkSession $self, T value) Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions.
      final static <T extends Any> Broadcast<T> broadcast(SparkContext $self, T value) Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions.
      final static <T extends Any> Dataset<T> toDS(SparkSession $self, List<T> list) Utility method to create a dataset from a list
      final static <T extends Any> Dataset<T> toDS(List<T> $self, SparkSession spark) Utility method to create a dataset from a list
      final static <T extends Any> Dataset<T> dsOf(SparkSession $self, T t) Utility method to create a dataset from the given elements
      final static <T extends Any> Encoder<T> encoder() Main method of the API, which gives you seamless integration with Spark: it creates an encoder for any given supported type T. Supported types are data classes, primitives, and Lists, Maps and Arrays containing them
      final static <T extends Any> Encoder<T> generateEncoder(KType type, KClass<?> cls)
      final static <T extends Any, R extends Any> Dataset<R> map(Dataset<T> $self, Function1<T, R> func)
      final static <T extends Any, R extends Any> Dataset<R> flatMap(Dataset<T> $self, Function1<T, Iterator<R>> func)
      final static <T extends Any, I extends Iterable<T>> Dataset<T> flatten(Dataset<I> $self)
      final static <T extends Any, R extends Any> KeyValueGroupedDataset<R, T> groupByKey(Dataset<T> $self, Function1<T, R> func)
      final static <T extends Any, R extends Any> Dataset<R> mapPartitions(Dataset<T> $self, Function1<Iterator<T>, Iterator<R>> func)
      final static <T extends Any> Dataset<T> filterNotNull(Dataset<T> $self)
      final static <KEY extends Any, VALUE extends Any, R extends Any> KeyValueGroupedDataset<KEY, R> mapValues(KeyValueGroupedDataset<KEY, VALUE> $self, Function1<VALUE, R> func)
      final static <KEY extends Any, VALUE extends Any, R extends Any> Dataset<R> mapGroups(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<KEY, Iterator<VALUE>, R> func)
      final static <KEY extends Any, VALUE extends Any> Dataset<Pair<KEY, VALUE>> reduceGroupsK(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<VALUE, VALUE, VALUE> func)
      final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysTuple2(Dataset<Tuple2<T1, T2>> $self)
      final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeys(Dataset<Pair<T1, T2>> $self)
      final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysArity2(Dataset<Arity2<T1, T2>> $self)
      final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesTuple2(Dataset<Tuple2<T1, T2>> $self)
      final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValues(Dataset<Pair<T1, T2>> $self)
      final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesArity2(Dataset<Arity2<T1, T2>> $self)
      final static <K extends Any, V extends Any, U extends Any> Dataset<U> flatMapGroups(KeyValueGroupedDataset<K, V> $self, Function2<K, Iterator<V>, Iterator<U>> func)
      final static <S extends Any> S getOrNull(GroupState<S> $self)
      final static <S extends Any> S getValue(GroupState<S> $self, Object thisRef, KProperty<?> property)
      final static <S extends Any> Unit setValue(GroupState<S> $self, Object thisRef, KProperty<?> property, S value)
      final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, Function3<K, Iterator<V>, GroupState<S>, U> func)
      final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, U> func)
      final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> flatMapGroupsWithState(KeyValueGroupedDataset<K, V> $self, OutputMode outputMode, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, Iterator<U>> func)
      final static <K extends Any, V extends Any, U extends Any, R extends Any> Dataset<R> cogroup(KeyValueGroupedDataset<K, V> $self, KeyValueGroupedDataset<K, U> other, Function3<K, Iterator<V>, Iterator<U>, Iterator<R>> func)
      final static <T extends Any, R extends Any> Dataset<R> downcast(Dataset<T> $self)
      final static <R extends Any> Dataset<R> as(Dataset<?> $self)
      final static <T extends Any> TypedColumn<Object, T> as(Column $self) Provides a type hint about the expected return value of this column.
      final static <R extends Any> Dataset<R> to(Dataset<?> $self)
      final static <T extends Any> Unit forEach(Dataset<T> $self, Function1<T, Unit> func)
      final static <T extends Any> Unit forEachPartition(Dataset<T> $self, Function1<Iterator<T>, Unit> func)
      final static <T extends Any> Dataset<T> debugCodegen(Dataset<T> $self) It is hard to call Dataset.debugCodegen from Kotlin, so here is a utility for that
      final static <T extends Any> Dataset<T> debug(Dataset<T> $self) It is hard to call Dataset.debug from Kotlin, so here is a utility for that
      final static Column ==(Column $self, Column c)
      final static Column unaryMinus(Column $self) Unary minus, i.e. negate the expression.
      final static Column not(Column $self) Inversion of boolean expression, i.e. NOT.
      final static Column eq(Column $self, Object other) Equality test.
      final static Column ===(Column $self, Object other) Equality test.
      final static Column neq(Column $self, Object other) Inequality test.
      final static Column =!=(Column $self, Object other) Inequality test.
      final static Column gt(Column $self, Object other) Greater than.
      final static Column lt(Column $self, Object other) Less than.
      final static Column leq(Column $self, Object other) Less than or equal to.
      final static Column geq(Column $self, Object other) Greater than or equal to an expression.
      final static Column inRangeOf(Column $self, ClosedRange<?> range) True if the current column is in the given range.
      final static Column or(Column $self, Object other) Boolean OR.
      final static Column and(Column $self, Object other) Boolean AND.
      final static Column &&(Column $self, Object other) Boolean AND.
      final static Column times(Column $self, Object other) Multiplication of this expression and another expression.
      final static Column div(Column $self, Object other) Division of this expression by another expression.
      final static Column rem(Column $self, Object other) Modulo (a.k.a. remainder) expression.
      final static Column get(Column $self, Object key) An expression that gets an item at position ordinal out of an array, or gets a value by key key in a MapType.
      final static Column lit(Object a)
      final static <L extends Any, R extends Any> Dataset<Pair<L, R>> leftJoin(Dataset<L> $self, Dataset<R> right, Column col) Alias for Dataset.joinWith which passes the "left" argument and respects the fact that, in the result of a left join, the right relation is nullable
      final static <L extends Any, R extends Any> Dataset<Pair<L, R>> rightJoin(Dataset<L> $self, Dataset<R> right, Column col) Alias for Dataset.joinWith which passes the "right" argument and respects the fact that, in the result of a right join, the left relation is nullable
      final static <L extends Any, R extends Any> Dataset<Pair<L, R>> innerJoin(Dataset<L> $self, Dataset<R> right, Column col) Alias for Dataset.joinWith which passes the "inner" argument
      final static <L extends Any, R extends Any> Dataset<Pair<L, R>> fullJoin(Dataset<L> $self, Dataset<R> right, Column col) Alias for Dataset.joinWith which passes the "full" argument and respects the fact that, in the result of a full join, either element of the resulting tuple is nullable
      final static <T extends Any> Dataset<T> sort(Dataset<T> $self, Function1<Dataset<T>, Array<Column>> columns) Alias for Dataset.sort which forces the user to provide sort columns from the source dataset
      final static <T extends Any> Dataset<T> sort(Dataset<T> $self, KProperty1<T, ?> col, KProperty1<T, ?> cols) Allows sorting a data class dataset on one or more properties of the data class.
      final static <T extends Any, R extends Any> R withCached(Dataset<T> $self, Boolean blockingUnpersist, Function1<Dataset<T>, R> executeOnCached) This function creates a block in which one can call further computations on an already cached dataset. Data will be unpersisted automatically at the end of the computation. It may be useful in many situations, for example, when one needs to write data to several targets:
      ds.withCached {
        write()
           .also { it.orc("First destination") }
           .also { it.avro("Second destination") }
      }
      final static <T extends Any> List<T> toList(Dataset<Row> $self)
      final static <R extends Any> Array<R> toArray(Dataset<?> $self)
      final static <T extends Any> Column invoke(Dataset<T> $self, String colName) Selects column based on the column name and returns it as a Column.
      final static <T extends Any, U extends Any> TypedColumn<T, U> invoke(Dataset<T> $self, KProperty1<T, U> column) Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
      final static <T extends Any, U extends Any> TypedColumn<T, U> col(Dataset<T> $self, KProperty1<T, U> column) Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.
      final static <T extends Any, U extends Any> TypedColumn<T, U> col(KProperty1<T, U> column) Returns a Column based on the given class attribute, not connected to a dataset.
      final static <T extends Any> Dataset<T> showDS(Dataset<T> $self, Integer numRows, Boolean truncate) Alternative to Dataset.show which returns the source dataset.
      final static <T extends Any, U1 extends Any, U2 extends Any> Dataset<Pair<U1, U2>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2) Returns a new Dataset by computing the given Column expressions for each element.
      final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any> Dataset<Triple<U1, U2, U3>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3) Returns a new Dataset by computing the given Column expressions for each element.
      final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any> Dataset<Arity4<U1, U2, U3, U4>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4) Returns a new Dataset by computing the given Column expressions for each element.
      final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any, U5 extends Any> Dataset<Arity5<U1, U2, U3, U4, U5>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4, TypedColumn<T, U5> c5) Returns a new Dataset by computing the given Column expressions for each element.
      final static <T extends Any> DataType schema(Map<String, KType> map)
      final static DataType schema(KType type, Map<String, KType> map)
      final static Unit setLogLevel(SparkContext $self, SparkLogLevel level)
      • Methods inherited from class java.lang.Object

        clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
    • Method Detail

      • broadcast

         final static <T extends Any> Broadcast<T> broadcast(SparkSession $self, T value)

        Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.

        Parameters:
        value - value to broadcast to the Spark nodes
      • broadcast

         final static <T extends Any> Broadcast<T> broadcast(SparkContext $self, T value)

        Broadcast a read-only variable to the cluster, returning a org.apache.spark.broadcast.Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.

        Parameters:
        value - value to broadcast to the Spark nodes
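
        A minimal usage sketch (the dataset ds and the lookup values are illustrative, not part of the API):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        val lookup: Broadcast<Map<Int, String>> = spark.broadcast(mapOf(1 to "one", 2 to "two"))
        ds.map { n: Int -> lookup.value[n] ?: "unknown" }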
      • toDS

         final static <T extends Any> Dataset<T> toDS(SparkSession $self, List<T> list)

        Utility method to create a dataset from a list

      • toDS

         final static <T extends Any> Dataset<T> toDS(List<T> $self, SparkSession spark)

        Utility method to create a dataset from a list
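
        For example (a sketch, assuming an active SparkSession named spark):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        val ds: Dataset<Int> = spark.toDS(listOf(1, 2, 3))
        // or, with the List receiver overload:
        val ds2: Dataset<Int> = listOf(1, 2, 3).toDS(spark)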

      • dsOf

         final static <T extends Any> Dataset<T> dsOf(SparkSession $self, T t)

        Utility method to create a dataset from the given elements
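
        For example (a sketch, assuming an active SparkSession named spark):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        val ds: Dataset<Int> = spark.dsOf(1, 2, 3)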

      • encoder

         final static <T extends Any> Encoder<T> encoder()

        Main method of the API, which gives you seamless integration with Spark: it creates an encoder for any given supported type T.

        Supported types are data classes, primitives, and Lists, Maps and Arrays containing them.
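
        A minimal sketch of calling it explicitly (most functions in this API obtain the encoder for you via reified type parameters; Person is an illustrative data class):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        data class Person(val name: String, val age: Int)
        val personEncoder: Encoder<Person> = encoder<Person>()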

      • map

         final static <T extends Any, R extends Any> Dataset<R> map(Dataset<T> $self, Function1<T, R> func)
      • flatMap

         final static <T extends Any, R extends Any> Dataset<R> flatMap(Dataset<T> $self, Function1<T, Iterator<R>> func)
      • flatten

         final static <T extends Any, I extends Iterable<T>> Dataset<T> flatten(Dataset<I> $self)
      • groupByKey

         final static <T extends Any, R extends Any> KeyValueGroupedDataset<R, T> groupByKey(Dataset<T> $self, Function1<T, R> func)
      • filterNotNull

         final static <T extends Any> Dataset<T> filterNotNull(Dataset<T> $self)
      • mapValues

         final static <KEY extends Any, VALUE extends Any, R extends Any> KeyValueGroupedDataset<KEY, R> mapValues(KeyValueGroupedDataset<KEY, VALUE> $self, Function1<VALUE, R> func)
      • mapGroups

         final static <KEY extends Any, VALUE extends Any, R extends Any> Dataset<R> mapGroups(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<KEY, Iterator<VALUE>, R> func)
      • reduceGroupsK

         final static <KEY extends Any, VALUE extends Any> Dataset<Pair<KEY, VALUE>> reduceGroupsK(KeyValueGroupedDataset<KEY, VALUE> $self, Function2<VALUE, VALUE, VALUE> func)
      • takeKeysTuple2

         final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysTuple2(Dataset<Tuple2<T1, T2>> $self)
      • takeKeys

         final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeys(Dataset<Pair<T1, T2>> $self)
      • takeKeysArity2

         final static <T1 extends Any, T2 extends Any> Dataset<T1> takeKeysArity2(Dataset<Arity2<T1, T2>> $self)
      • takeValuesTuple2

         final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesTuple2(Dataset<Tuple2<T1, T2>> $self)
      • takeValues

         final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValues(Dataset<Pair<T1, T2>> $self)
      • takeValuesArity2

         final static <T1 extends Any, T2 extends Any> Dataset<T2> takeValuesArity2(Dataset<Arity2<T1, T2>> $self)
      • getOrNull

         final static <S extends Any> S getOrNull(GroupState<S> $self)
      • mapGroupsWithState

         final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, Function3<K, Iterator<V>, GroupState<S>, U> func)
      • mapGroupsWithState

         final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> mapGroupsWithState(KeyValueGroupedDataset<K, V> $self, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, U> func)
      • flatMapGroupsWithState

         final static <K extends Any, V extends Any, S extends Any, U extends Any> Dataset<U> flatMapGroupsWithState(KeyValueGroupedDataset<K, V> $self, OutputMode outputMode, GroupStateTimeout timeoutConf, Function3<K, Iterator<V>, GroupState<S>, Iterator<U>> func)
      • cogroup

         final static <K extends Any, V extends Any, U extends Any, R extends Any> Dataset<R> cogroup(KeyValueGroupedDataset<K, V> $self, KeyValueGroupedDataset<K, U> other, Function3<K, Iterator<V>, Iterator<U>, Iterator<R>> func)
      • downcast

         final static <T extends Any, R extends Any> Dataset<R> downcast(Dataset<T> $self)
      • as

         final static <R extends Any> Dataset<R> as(Dataset<?> $self)
      • as

         final static <T extends Any> TypedColumn<Object, T> as(Column $self)

        Provides a type hint about the expected return value of this column. This information can be used by operations such as select on a Dataset to automatically convert the results into the correct JVM types.
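
        Since as is a hard keyword in Kotlin, the function has to be called with backticks (a minimal sketch, assuming df has a numeric "amount" column):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        val amount = df("amount").`as`<Double>()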

      • to

         final static <R extends Any> Dataset<R> to(Dataset<?> $self)
      • forEach

         final static <T extends Any> Unit forEach(Dataset<T> $self, Function1<T, Unit> func)
      • debugCodegen

         final static <T extends Any> Dataset<T> debugCodegen(Dataset<T> $self)

        It is hard to call Dataset.debugCodegen from Kotlin, so here is a utility for that.

      • debug

         final static <T extends Any> Dataset<T> debug(Dataset<T> $self)

        It is hard to call Dataset.debug from Kotlin, so here is a utility for that.
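
        Both utilities return the source dataset, so they can be chained in the middle of a pipeline (a sketch; transform and predicate are illustrative functions):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        ds.map { transform(it) }
            .debugCodegen() // prints the generated code and returns the dataset
            .filter { predicate(it) }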

      • ==

         final static Column ==(Column $self, Column c)
      • unaryMinus

         final static Column unaryMinus(Column $self)

        Unary minus, i.e. negate the expression.

        // Scala: select the amount column and negates all values.
        df.select( -df("amount") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.select( -df("amount") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        df.select( negate(col("amount")) );
      • not

         final static Column not(Column $self)

        Inversion of boolean expression, i.e. NOT.

        // Scala: select rows that are not active (isActive === false)
        df.filter( !df("isActive") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.filter( !df("isActive") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        df.filter( not(df.col("isActive")) );
      • eq

         final static Column eq(Column $self, Object other)

        Equality test.

        // Scala:
        df.filter( df("colA") === df("colB") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.filter( df("colA") eq df("colB") )
        // or
        df.filter( df("colA") `===` df("colB") )
        
        // Java
        import static org.apache.spark.sql.functions.*;
        df.filter( col("colA").equalTo(col("colB")) );
      • ===

         final static Column ===(Column $self, Object other)

        Equality test.

        // Scala:
        df.filter( df("colA") === df("colB") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.filter( df("colA") eq df("colB") )
        // or
        df.filter( df("colA") `===` df("colB") )
        
        // Java
        import static org.apache.spark.sql.functions.*;
        df.filter( col("colA").equalTo(col("colB")) );
      • neq

         final static Column neq(Column $self, Object other)

        Inequality test.

        // Scala:
        df.select( df("colA") =!= df("colB") )
        df.select( !(df("colA") === df("colB")) )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.select( df("colA") neq df("colB") )
        df.select( !(df("colA") eq df("colB")) )
        // or
        df.select( df("colA") `=!=` df("colB") )
        df.select( !(df("colA") `===` df("colB")) )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        df.filter( col("colA").notEqual(col("colB")) );
      • =!=

         final static Column =!=(Column $self, Object other)

        Inequality test.

        // Scala:
        df.select( df("colA") =!= df("colB") )
        df.select( !(df("colA") === df("colB")) )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.select( df("colA") neq df("colB") )
        df.select( !(df("colA") eq df("colB")) )
        // or
        df.select( df("colA") `=!=` df("colB") )
        df.select( !(df("colA") `===` df("colB")) )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        df.filter( col("colA").notEqual(col("colB")) );
      • gt

         final static Column gt(Column $self, Object other)

        Greater than.

        // Scala: The following selects people older than 21.
        people.select( people("age") 21 )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.select( people("age") gt 21 )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("age").gt(21) );
      • lt

         final static Column lt(Column $self, Object other)

        Less than.

        // Scala: The following selects people younger than 21.
        people.select( people("age") < 21 )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.select( people("age") lt 21 )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("age").lt(21) );
      • leq

         final static Column leq(Column $self, Object other)

        Less than or equal to.

        // Scala: The following selects people aged 21 or younger.
        people.select( people("age") <= 21 )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.select( people("age") leq 21 )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("age").leq(21) );
      • geq

         final static Column geq(Column $self, Object other)

        Greater than or equal to an expression.

        // Scala: The following selects people aged 21 or older.
        people.select( people("age") >= 21 )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.select( people("age") geq 21 )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("age").geq(21) );
      • inRangeOf

         final static Column inRangeOf(Column $self, ClosedRange<?> range)

        True if the current column is in the given range.

        // Scala:
        df.where( df("colA").between(1, 5) )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.where( df("colA") inRangeOf 1..5 )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        df.where( df.col("colA").between(1, 5) );
      • or

         final static Column or(Column $self, Object other)

        Boolean OR.

        // Scala: The following selects people that are in school or employed.
        people.filter( people("inSchool") || people("isEmployed") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.filter( people("inSchool") or people("isEmployed") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.filter( people.col("inSchool").or(people.col("isEmployed")) );
      • and

         final static Column and(Column $self, Object other)

        Boolean AND.

        // Scala: The following selects people that are in school and employed at the same time.
        people.select( people("inSchool") && people("isEmployed") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.filter( people("inSchool") and people("isEmployed") )
        // or
        people.filter( people("inSchool") `&&` people("isEmployed") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("inSchool").and(people.col("isEmployed")) );
      • &&

         final static Column &&(Column $self, Object other)

        Boolean AND.

        // Scala: The following selects people that are in school and employed at the same time.
        people.select( people("inSchool") && people("isEmployed") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.filter( people("inSchool") and people("isEmployed") )
        // or
        people.filter( people("inSchool") `&&` people("isEmployed") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("inSchool").and(people.col("isEmployed")) );
      • times

         final static Column times(Column $self, Object other)

        Multiplication of this expression and another expression.

        // Scala: The following multiplies a person's height by their weight.
        people.select( people("height") * people("weight") )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        people.select( people("height") * people("weight") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("height").multiply(people.col("weight")) );
      • div

         final static Column div(Column $self, Object other)

        Division of this expression by another expression.

        // Scala: The following divides a person's height by their weight.
        people.select( people("height") / people("weight") )
        
        // Kotlin
        import org.jetbrains.kotlinx.spark.api.*
        people.select( people("height") / people("weight") )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        people.select( people.col("height").divide(people.col("weight")) );
      • rem

         final static Column rem(Column $self, Object other)

        Modulo (a.k.a. remainder) expression.

        // Scala:
        df.where( df("colA") % 2 === 0 )
        
        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        df.where( df("colA") % 2 eq 0 )
        
        // Java:
        import static org.apache.spark.sql.functions.*;
        df.where( df.col("colA").mod(2).equalTo(0) );
      • get

         final static Column get(Column $self, Object key)

        An expression that gets an item at position ordinal out of an array, or gets a value by key key in a MapType.

        // Scala:
        df.where( df("arrayColumn").getItem(0) === 5 )
        
        // Kotlin
        import org.jetbrains.kotlinx.spark.api.*
        df.where( df("arrayColumn")[0] eq 5 )
        
        // Java
        import static org.apache.spark.sql.functions.*;
        df.where( df.col("arrayColumn").getItem(0).equalTo(5) );
      • leftJoin

         final static <L extends Any, R extends Any> Dataset<Pair<L, R>> leftJoin(Dataset<L> $self, Dataset<R> right, Column col)

        Alias for Dataset.joinWith which passes "left" argument and respects the fact that in result of left join right relation is nullable

        Parameters:
        right - right dataset
        col - join condition
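
        A minimal sketch (Customer and Order are illustrative data classes; for unmatched rows the right side of each resulting Pair is null):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        val joined = customers.leftJoin(orders, customers("id") eq orders("customerId"))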
      • rightJoin

         final static <L extends Any, R extends Any> Dataset<Pair<L, R>> rightJoin(Dataset<L> $self, Dataset<R> right, Column col)

        Alias for Dataset.joinWith which passes "right" argument and respects the fact that in result of right join left relation is nullable

        Parameters:
        right - right dataset
        col - join condition
      • innerJoin

         final static <L extends Any, R extends Any> Dataset<Pair<L, R>> innerJoin(Dataset<L> $self, Dataset<R> right, Column col)

        Alias for Dataset.joinWith which passes "inner" argument

        Parameters:
        right - right dataset
        col - join condition
      • fullJoin

         final static <L extends Any, R extends Any> Dataset<Pair<L, R>> fullJoin(Dataset<L> $self, Dataset<R> right, Column col)

        Alias for Dataset.joinWith which passes "full" argument and respects the fact that in result of join any element of resulting tuple is nullable

        Parameters:
        right - right dataset
        col - join condition
      • sort

         final static <T extends Any> Dataset<T> sort(Dataset<T> $self, Function1<Dataset<T>, Array<Column>> columns)

        Alias for Dataset.sort which forces the user to provide sort columns from the source dataset

        Parameters:
        columns - producer of sort columns
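
        For example (a sketch, assuming the dataset has "age" and "name" columns):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        val sorted = ds.sort { arrayOf(it("age"), it("name")) }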
      • sort

         final static <T extends Any> Dataset<T> sort(Dataset<T> $self, KProperty1<T, ?> col, KProperty1<T, ?> cols)

        Allows sorting a data class dataset on one or more properties of the data class.

        val sorted: Dataset<YourClass> = unsorted.sort(YourClass::a)
        val sorted2: Dataset<YourClass> = unsorted.sort(YourClass::a, YourClass::b)
      • withCached

         final static <T extends Any, R extends Any> R withCached(Dataset<T> $self, Boolean blockingUnpersist, Function1<Dataset<T>, R> executeOnCached)

        This function creates a block in which one can call further computations on an already cached dataset. Data will be unpersisted automatically at the end of the computation.

        It may be useful in many situations, for example, when one needs to write data to several targets:

        ds.withCached {
          write()
             .also { it.orc("First destination") }
             .also { it.avro("Second destination") }
        }
        Parameters:
        blockingUnpersist - whether execution should block until everything persisted is deleted
        executeOnCached - block which should be executed on the cached dataset
      • toList

         final static <T extends Any> List<T> toList(Dataset<Row> $self)
      • invoke

         final static <T extends Any> Column invoke(Dataset<T> $self, String colName)

        Selects a column based on the column name and returns it as a Column.
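
        For example (assuming the dataset has an "age" column):

            val ageColumn: Column = dataset("age")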

      • invoke

         final static <T extends Any, U extends Any> TypedColumn<T, U> invoke(Dataset<T> $self, KProperty1<T, U> column)

        Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.

            val dataset: Dataset<YourClass> = ...
            val columnA: TypedColumn<YourClass, TypeOfA> = dataset(YourClass::a)
      • col

         final static <T extends Any, U extends Any> TypedColumn<T, U> col(Dataset<T> $self, KProperty1<T, U> column)

        Helper function to quickly get a TypedColumn (or Column) from a dataset in a refactor-safe manner.

            val dataset: Dataset<YourClass> = ...
            val columnA: TypedColumn<YourClass, TypeOfA> = dataset.col(YourClass::a)
      • col

         final static <T extends Any, U extends Any> TypedColumn<T, U> col(KProperty1<T, U> column)

        Returns a Column based on the given class attribute, not connected to a dataset.

            val dataset: Dataset<YourClass> = ...
            val new: Dataset<Tuple2<TypeOfA, TypeOfB>> = dataset.select( col(YourClass::a), col(YourClass::b) )

        TODO: change example to Pairs when merged

      • showDS

         final static <T extends Any> Dataset<T> showDS(Dataset<T> $self, Integer numRows, Boolean truncate)

        Alternative to Dataset.show which returns the source dataset. Useful for debugging when you need to view the content of a dataset as an intermediate operation.
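
        For example (a sketch; transform and predicate are illustrative functions):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        ds.map { transform(it) }
            .showDS(20, true) // prints up to 20 rows and returns the dataset
            .filter { predicate(it) }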

      • selectTyped

         final static <T extends Any, U1 extends Any, U2 extends Any> Dataset<Pair<U1, U2>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2)

        Returns a new Dataset by computing the given Column expressions for each element.
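
        A minimal sketch (Person is an illustrative data class):

        // Kotlin:
        import org.jetbrains.kotlinx.spark.api.*
        data class Person(val name: String, val age: Int)
        val pairs: Dataset<Pair<String, Int>> = ds.selectTyped(col(Person::name), col(Person::age))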

      • selectTyped

         final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any> Dataset<Triple<U1, U2, U3>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3)

        Returns a new Dataset by computing the given Column expressions for each element.

      • selectTyped

         final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any> Dataset<Arity4<U1, U2, U3, U4>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4)

        Returns a new Dataset by computing the given Column expressions for each element.

      • selectTyped

         final static <T extends Any, U1 extends Any, U2 extends Any, U3 extends Any, U4 extends Any, U5 extends Any> Dataset<Arity5<U1, U2, U3, U4, U5>> selectTyped(Dataset<T> $self, TypedColumn<T, U1> c1, TypedColumn<T, U2> c2, TypedColumn<T, U3> c3, TypedColumn<T, U4> c4, TypedColumn<T, U5> c5)

        Returns a new Dataset by computing the given Column expressions for each element.