training

Type Members

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true) extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset should be in the format of CoNLL 2003, and the path to it needs to be passed to readDataset. Other CoNLL datasets are not supported.

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)

documentCol: Name of the DOCUMENT Annotator type column
sentenceCol: Name of the DOCUMENT Annotator type column that holds the sentences
tokenCol: Name of the TOKEN Annotator type column
posCol: Name of the POS Annotator type column
conllLabelIndex: Index of the column for NER Label in the dataset
conllPosIndex: Index of the column for the POS tags in the dataset
conllTextCol: Name of the column for the text in the dataset
labelCol: Name of the NAMED_ENTITY Annotator type column
explodeSentences: Whether to explode each sentence to a separate row

class CoNLL2003NerReader extends AnyRef

Helper class to work with the CoNLL 2003 dataset for the NER task. The class is made for easy use from Java.
case class CoNLLDocument(text: String, nerTagged: Seq[NerTaggedSentence], posTagged: Seq[PosTaggedSentence]) extends Product with Serializable
case class CoNLLU(explodeSentences: Boolean = true) extends Product with Serializable
case class CoNLLUDocument(text: String, uPosTagged: Seq[PosTaggedSentence], xPosTagged: Seq[PosTaggedSentence], lemma: Seq[PosTaggedSentence]) extends Product with Serializable

case class POS() extends Product with Serializable

Helper class for creating DataFrames for training a part-of-speech tagger.

The dataset needs to consist of one sentence per line, where each word is joined to its respective tag by a delimiter:

Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.

The sentence can then be parsed with readDataset into a column with annotations of type POS.

Example

In this example, the file test-training.txt has the content of the sentence above.

import com.johnsnowlabs.nlp.training.POS

val pos = POS()
val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
val posDf = pos.readDataset(spark, path, "|", "tags")

posDf.selectExpr("explode(tags) as tags").show(false)
+---------------------------------------------+
|tags                                         |
+---------------------------------------------+
|[pos, 0, 5, NNP, [word -> Pierre], []]       |
|[pos, 7, 12, NNP, [word -> Vinken], []]      |
|[pos, 14, 14, ,, [word -> ,], []]            |
|[pos, 16, 17, CD, [word -> 61], []]          |
|[pos, 19, 23, NNS, [word -> years], []]      |
|[pos, 25, 27, JJ, [word -> old], []]         |
|[pos, 29, 29, ,, [word -> ,], []]            |
|[pos, 31, 34, MD, [word -> will], []]        |
|[pos, 36, 39, VB, [word -> join], []]        |
|[pos, 41, 43, DT, [word -> the], []]         |
|[pos, 45, 49, NN, [word -> board], []]       |
|[pos, 51, 52, IN, [word -> as], []]          |
|[pos, 47, 47, DT, [word -> a], []]           |
|[pos, 56, 67, JJ, [word -> nonexecutive], []]|
|[pos, 69, 76, NN, [word -> director], []]    |
|[pos, 78, 81, NNP, [word -> Nov.], []]       |
|[pos, 83, 84, CD, [word -> 29], []]          |
|[pos, 81, 81, ., [word -> .], []]            |
+---------------------------------------------+

case class PubTator() extends Product with Serializable

Value Members

object CoNLLHelper
object CoNLLUCols extends Enumeration

package training

Type Members

Example

class CoNLL2003NerReader extends AnyRef

case class CoNLLDocument(text: String, nerTagged: Seq[NerTaggedSentence], posTagged: Seq[PosTaggedSentence]) extends Product with Serializable

case class CoNLLU(explodeSentences: Boolean = true) extends Product with Serializable

case class CoNLLUDocument(text: String, uPosTagged: Seq[PosTaggedSentence], xPosTagged: Seq[PosTaggedSentence], lemma: Seq[PosTaggedSentence]) extends Product with Serializable

case class POS() extends Product with Serializable

Example

case class PubTator() extends Product with Serializable

Value Members

object CoNLLHelper

object CoNLLUCols extends Enumeration

Ungrouped