PubTator

The PubTator format includes medical papers’ titles, abstracts, and tagged chunks.

For more information see PubTator Docs and MedMentions Docs.

readDataset is used to create a Spark DataFrame from a PubTator text file.

Example

import com.johnsnowlabs.nlp.training.PubTator

val pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
val pubTatorDataSet = PubTator().readDataset(ResourceHelper.spark, pubTatorFile)
pubTatorDataSet.show(1)
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
|  doc_id|      finished_token|        finished_pos|        finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
|25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...|   [[sentence, 0], [...| [[word, DCTN4], [...|   [[word, DCTN4], [...|
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+

Linear Supertypes

Serializable, Serializable, Product, Equals, AnyRef, Any

Instance Constructors

new PubTator()

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
def readDataset(spark: SparkSession, path: String, isPaddedToken: Boolean = true): DataFrame
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )

Related Doc: package training

case class PubTator() extends Product with Serializable

Example

Instance Constructors

new PubTator()

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

final def asInstanceOf[T0]: T0

def clone(): AnyRef

final def eq(arg0: AnyRef): Boolean

def finalize(): Unit

final def getClass(): Class[_]

final def isInstanceOf[T0]: Boolean

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

def readDataset(spark: SparkSession, path: String, isPaddedToken: Boolean = true): DataFrame

final def synchronized[T0](arg0: ⇒ T0): T0

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped