case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false) extends Product with Serializable
Helper class to load a CoNLL-type dataset for training.
The dataset should be in the
CoNLL 2003 format and is loaded with
readDataset. Other CoNLL datasets are not supported.
Two types of input paths are supported,
Folder: this is a path ending in *, and representing a collection of CoNLL files within a
directory. E.g., 'path/to/multiple/conlls/*' Using this pattern will result in all the
files being read into a single Dataframe. Some constraints apply on the schemas of the
multiple files.
File: this is a path to a single file. E.g., 'path/to/single_file.conll'
Example
val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train") trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label") .show(3, false) +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ |text |tokens |pos |label | +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]| |Peter Blackburn |[Peter, Blackburn] |[NNP, NNP] |[B-PER, I-PER] | |BRUSSELS 1996-08-22 |[BRUSSELS, 1996-08-22] |[NNP, CD] |[B-LOC, O] | +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ trainingData.printSchema root |-- text: string (nullable = true) |-- document: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- sentence: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string 
(valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- token: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- pos: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- label: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false)
- documentCol
Name of the DOCUMENT Annotator type column
- sentenceCol
Name of the Sentences of DOCUMENT Annotator type column
- tokenCol
Name of the TOKEN Annotator type column
- posCol
Name of the POS Annotator type column
- conllLabelIndex
Index of the column for the NER label in the dataset
- conllPosIndex
Index of the column for the POS tags in the dataset
- conllDocIdCol
Name of the column for the document id in the dataset
- conllTextCol
Name of the column for the text in the dataset
- labelCol
Name of the NAMED_ENTITY Annotator type column
- explodeSentences
Whether to explode each sentence to a separate row
- delimiter
Delimiter used to separate columns inside the CoNLL file
- includeDocId
Whether to try to parse the document id from the third item in the -DOCSTART- line (X if not found)
- Alphabetic
- By Inheritance
- CoNLL
- Serializable
- Product
- Equals
- AnyRef
- Any
- Hide All
- Show All
- Public
- Protected
Instance Constructors
- new CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false)
- documentCol
Name of the DOCUMENT Annotator type column
- sentenceCol
Name of the Sentences of DOCUMENT Annotator type column
- tokenCol
Name of the TOKEN Annotator type column
- posCol
Name of the POS Annotator type column
- conllLabelIndex
Index of the column for the NER label in the dataset
- conllPosIndex
Index of the column for the POS tags in the dataset
- conllDocIdCol
Name of the column for the document id in the dataset
- conllTextCol
Name of the column for the text in the dataset
- labelCol
Name of the NAMED_ENTITY Annotator type column
- explodeSentences
Whether to explode each sentence to a separate row
- delimiter
Delimiter used to separate columns inside the CoNLL file
- includeDocId
Whether to try to parse the document id from the third item in the -DOCSTART- line (X if not found)
Value Members
- final def !=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- final def ##: Int
- Definition Classes
- AnyRef → Any
- final def ==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val annotationType: ArrayType
- final def asInstanceOf[T0]: T0
- Definition Classes
- Any
- def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]
- def clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.CloneNotSupportedException]) @HotSpotIntrinsicCandidate() @native()
- val conllDocIdCol: String
- val conllLabelIndex: Int
- val conllPosIndex: Int
- val conllTextCol: String
- val delimiter: String
- val documentCol: String
- final def eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- val explodeSentences: Boolean
- def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
- final def getClass(): Class[_ <: AnyRef]
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- val includeDocId: Boolean
- final def isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val labelCol: String
- final def ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- final def notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
- def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]
- def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]
- def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- val posCol: String
- def productElementNames: Iterator[String]
- Definition Classes
- Product
- def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString, parallelism: Int = 8, storageLevel: StorageLevel = StorageLevel.DISK_ONLY): Dataset[_]
- def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
- def readDocs(er: ExternalResource): Seq[CoNLLDocument]
- def readLines(lines: Array[String]): Seq[CoNLLDocument]
- def removeSurroundingHyphens(text: String): String
- def schema: StructType
- val sentenceCol: String
- final def synchronized[T0](arg0: => T0): T0
- Definition Classes
- AnyRef
- val tokenCol: String
- final def wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- final def wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException]) @native()
- final def wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
Deprecated Value Members
- def finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.Throwable]) @Deprecated
- Deprecated
(Since version 9)