training

Type Members

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ") extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset should be in the format of CoNLL 2003 and needs to be specified with readDataset. Other CoNLL datasets are not supported.

Two types of input paths are supported,

Folder: this is a path ending in *, and representing a collection of CoNLL files within a directory. E.g., 'path/to/multiple/conlls/*' Using this pattern will result in all the files being read into a single Dataframe. Some constraints apply on the schemas of the multiple files.

File: this is a path to a single file. E.g., 'path/to/single_file.conll'

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)

documentCol: Name of the DOCUMENT Annotator type column
sentenceCol: Name of the Sentences of DOCUMENT Annotator type column
tokenCol: Name of the TOKEN Annotator type column
posCol: Name of the POS Annotator type column
conllLabelIndex: Index of the column for NER Label in the dataset
conllPosIndex: Index of the column for the POS tags in the dataset
conllTextCol: Index of the column for the text in the dataset
labelCol: Name of the NAMED_ENTITY Annotator type column
explodeSentences: Whether to explode each sentence to a separate row
delimiter: Delimiter used to separate columns inside CoNLL file

class CoNLL2003NerReader extends AnyRef

Helper class for to work with CoNLL 2003 dataset for NER task Class is made for easy use from Java
case class CoNLLDocument(text: String, nerTagged: Seq[NerTaggedSentence], posTagged: Seq[PosTaggedSentence]) extends Product with Serializable

case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable

Instantiates the class to read a CoNLL-U dataset.

The dataset should be in the format of CoNLL-U and needs to be specified with readDataset, which will create a dataframe with the data.

Example

import com.johnsnowlabs.nlp.training.CoNLLU

val conlluFile = "src/test/resources/conllu/en.test.conllu"
val conllDataSet = CoNLLU(false).readDataset(ResourceHelper.spark, conlluFile)
conllDataSet.selectExpr("text", "form.result as form", "upos.result as upos", "xpos.result as xpos", "lemma.result as lemma")
  .show(1, false)
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+

explodeSentences: Whether to split each sentence into a separate row

case class CoNLLUDocument(text: String, uPosTagged: Seq[PosTaggedSentence], xPosTagged: Seq[PosTaggedSentence], lemma: Seq[PosTaggedSentence]) extends Product with Serializable

case class POS() extends Product with Serializable

Helper class for creating DataFrames for training a part-of-speech tagger.

The dataset needs to consist of sentences on each line, where each word is delimited with its respective tag:

Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.

The sentence can then be parsed with readDataset into a column with annotations of type POS.

Example

In this example, the file test-training.txt has the content of the sentence above.

import com.johnsnowlabs.nlp.training.POS

val pos = POS()
val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
val posDf = pos.readDataset(spark, path, "|", "tags")

posDf.selectExpr("explode(tags) as tags").show(false)
+---------------------------------------------+
|tags                                         |
+---------------------------------------------+
|[pos, 0, 5, NNP, [word -> Pierre], []]       |
|[pos, 7, 12, NNP, [word -> Vinken], []]      |
|[pos, 14, 14, ,, [word -> ,], []]            |
|[pos, 16, 17, CD, [word -> 61], []]          |
|[pos, 19, 23, NNS, [word -> years], []]      |
|[pos, 25, 27, JJ, [word -> old], []]         |
|[pos, 29, 29, ,, [word -> ,], []]            |
|[pos, 31, 34, MD, [word -> will], []]        |
|[pos, 36, 39, VB, [word -> join], []]        |
|[pos, 41, 43, DT, [word -> the], []]         |
|[pos, 45, 49, NN, [word -> board], []]       |
|[pos, 51, 52, IN, [word -> as], []]          |
|[pos, 47, 47, DT, [word -> a], []]           |
|[pos, 56, 67, JJ, [word -> nonexecutive], []]|
|[pos, 69, 76, NN, [word -> director], []]    |
|[pos, 78, 81, NNP, [word -> Nov.], []]       |
|[pos, 83, 84, CD, [word -> 29], []]          |
|[pos, 81, 81, ., [word -> .], []]            |
+---------------------------------------------+

case class PubTator() extends Product with Serializable

The PubTator format includes medical papers’ titles, abstracts, and tagged chunks.

For more information see PubTator Docs and MedMentions Docs.

readDataset is used to create a Spark DataFrame from a PubTator text file.

Example

import com.johnsnowlabs.nlp.training.PubTator

val pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
val pubTatorDataSet = PubTator().readDataset(ResourceHelper.spark, pubTatorFile)
pubTatorDataSet.show(1)
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
|  doc_id|      finished_token|        finished_pos|        finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
|25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...|   [[sentence, 0], [...| [[word, DCTN4], [...|   [[word, DCTN4], [...|
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+

Value Members

object CoNLLHelper
object CoNLLUCols extends Enumeration

package training

Type Members

Example

class CoNLL2003NerReader extends AnyRef

case class CoNLLDocument(text: String, nerTagged: Seq[NerTaggedSentence], posTagged: Seq[PosTaggedSentence]) extends Product with Serializable

case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable

Example

case class CoNLLUDocument(text: String, uPosTagged: Seq[PosTaggedSentence], xPosTagged: Seq[PosTaggedSentence], lemma: Seq[PosTaggedSentence]) extends Product with Serializable

case class POS() extends Product with Serializable

Example

case class PubTator() extends Product with Serializable

Example

Value Members

object CoNLLHelper

object CoNLLUCols extends Enumeration

Ungrouped