ner

Type Members

case class NamedEntity(start: Int, end: Int, entity: String, text: String, sentenceId: String, confidence: Option[Float]) extends Product with Serializable
trait NerApproach[T <: NerApproach[_]] extends Params

class NerConverter extends AnnotatorModel[NerConverter] with HasSimpleAnnotate[NerConverter]

Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities with their labels.

Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities with their labels. Results in CHUNK Annotation type.

NER chunks can then be filtered by setting a whitelist of entity labels with setWhiteList. Chunks with no associated entity (tagged "O") are filtered out.

See also Inside–outside–beginning (tagging) (https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) for more information.

Example

This is a continuation of the example of the NerDLModel. See that class on how to extract the entities.

The output of the NerDLModel follows the Annotator schema and can be converted like so:

result.selectExpr("explode(ner)").show(false)
+----------------------------------------------------+
|col                                                 |
+----------------------------------------------------+
|[named_entity, 0, 2, B-ORG, [word -> U.N], []]      |
|[named_entity, 3, 3, O, [word -> .], []]            |
|[named_entity, 5, 12, O, [word -> official], []]    |
|[named_entity, 14, 18, B-PER, [word -> Ekeus], []]  |
|[named_entity, 20, 24, O, [word -> heads], []]      |
|[named_entity, 26, 28, O, [word -> for], []]        |
|[named_entity, 30, 36, B-LOC, [word -> Baghdad], []]|
|[named_entity, 37, 37, O, [word -> .], []]          |
+----------------------------------------------------+

After the converter is used:

val converter = new NerConverter()
  .setInputCols("sentence", "token", "ner")
  .setOutputCol("entities")
  .setPreservePosition(false)

converter.transform(result).selectExpr("explode(entities)").show(false)
+------------------------------------------------------------------------+
|col                                                                     |
+------------------------------------------------------------------------+
|[chunk, 0, 2, U.N, [entity -> ORG, sentence -> 0, chunk -> 0], []]      |
|[chunk, 14, 18, Ekeus, [entity -> PER, sentence -> 0, chunk -> 1], []]  |
|[chunk, 30, 36, Baghdad, [entity -> LOC, sentence -> 0, chunk -> 2], []]|
+------------------------------------------------------------------------+

class NerOverwriter extends AnnotatorModel[NerOverwriter] with HasSimpleAnnotate[NerOverwriter]

Overwrites entities of specified strings.

The input for this annotator has to be entities that have already been extracted (annotator type NAMED_ENTITY). The strings specified with setStopWords will be assigned the new entity specified with setNewResult.

Example

import spark.implicits._
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel
import com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel
import com.johnsnowlabs.nlp.annotators.ner.NerOverwriter
import org.apache.spark.ml.Pipeline

// First extract the prerequisite Entities
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val sentence = new SentenceDetector()
  .setInputCols("document")
  .setOutputCol("sentence")

val tokenizer = new Tokenizer()
  .setInputCols("sentence")
  .setOutputCol("token")

val embeddings = WordEmbeddingsModel.pretrained()
  .setInputCols("sentence", "token")
  .setOutputCol("embeddings")

val nerTagger = NerDLModel.pretrained()
  .setInputCols("sentence", "token", "embeddings")
  .setOutputCol("ner")

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  sentence,
  tokenizer,
  embeddings,
  nerTagger
))

val data = Seq("Spark NLP Crosses Five Million Downloads, John Snow Labs Announces.").toDF("text")
val result = pipeline.fit(data).transform(data)

result.selectExpr("explode(ner)").show(false)
/*
+------------------------------------------------------+
|col                                                   |
+------------------------------------------------------+
|[named_entity, 0, 4, B-ORG, [word -> Spark], []]      |
|[named_entity, 6, 8, I-ORG, [word -> NLP], []]        |
|[named_entity, 10, 16, O, [word -> Crosses], []]      |
|[named_entity, 18, 21, O, [word -> Five], []]         |
|[named_entity, 23, 29, O, [word -> Million], []]      |
|[named_entity, 31, 39, O, [word -> Downloads], []]    |
|[named_entity, 40, 40, O, [word -> ,], []]            |
|[named_entity, 42, 45, B-ORG, [word -> John], []]     |
|[named_entity, 47, 50, I-ORG, [word -> Snow], []]     |
|[named_entity, 52, 55, I-ORG, [word -> Labs], []]     |
|[named_entity, 57, 65, I-ORG, [word -> Announces], []]|
|[named_entity, 66, 66, O, [word -> .], []]            |
+------------------------------------------------------+
*/
// The recognized entities can then be overwritten
val nerOverwriter = new NerOverwriter()
  .setInputCols("ner")
  .setOutputCol("ner_overwritten")
  .setStopWords(Array("Million"))
  .setNewResult("B-CARDINAL")

nerOverwriter.transform(result).selectExpr("explode(ner_overwritten)").show(false)
+---------------------------------------------------------+
|col                                                      |
+---------------------------------------------------------+
|[named_entity, 0, 4, B-ORG, [word -> Spark], []]         |
|[named_entity, 6, 8, I-ORG, [word -> NLP], []]           |
|[named_entity, 10, 16, O, [word -> Crosses], []]         |
|[named_entity, 18, 21, O, [word -> Five], []]            |
|[named_entity, 23, 29, B-CARDINAL, [word -> Million], []]|
|[named_entity, 31, 39, O, [word -> Downloads], []]       |
|[named_entity, 40, 40, O, [word -> ,], []]               |
|[named_entity, 42, 45, B-ORG, [word -> John], []]        |
|[named_entity, 47, 50, I-ORG, [word -> Snow], []]        |
|[named_entity, 52, 55, I-ORG, [word -> Labs], []]        |
|[named_entity, 57, 65, I-ORG, [word -> Announces], []]   |
|[named_entity, 66, 66, O, [word -> .], []]               |
+---------------------------------------------------------+

Value Members

object NerConverter extends ParamsAndFeaturesReadable[NerConverter] with Serializable
object NerOverwriter extends DefaultParamsReadable[NerOverwriter] with Serializable

This is the companion object of NerOverwriter.
This is the companion object of NerOverwriter. Please refer to that class for the documentation.
object NerTagsEncoding

Works with different NER representations as tags. Supports: IOB and IOB2 (see https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
object Verbose extends Enumeration
package crf
package dl

package ner

Type Members

case class NamedEntity(start: Int, end: Int, entity: String, text: String, sentenceId: String, confidence: Option[Float]) extends Product with Serializable

trait NerApproach[T <: NerApproach[_]] extends Params

class NerConverter extends AnnotatorModel[NerConverter] with HasSimpleAnnotate[NerConverter]

Example

class NerOverwriter extends AnnotatorModel[NerOverwriter] with HasSimpleAnnotate[NerOverwriter]

Example

Value Members

object NerConverter extends ParamsAndFeaturesReadable[NerConverter] with Serializable

object NerOverwriter extends DefaultParamsReadable[NerOverwriter] with Serializable

object NerTagsEncoding

object Verbose extends Enumeration

package crf

package dl

Ungrouped