com.johnsnowlabs.nlp.annotators.tokenizer.bpe

RobertaTokenizer

class RobertaTokenizer extends BpeTokenizer

Linear Supertypes

BpeTokenizer, AnyRef, Any

Ordering

Alphabetic
By Inheritance

Inherited

RobertaTokenizer
BpeTokenizer
AnyRef
Any

Hide All
Show All

Visibility

Public
All

Instance Constructors

new RobertaTokenizer(merges: Map[(String, String), Int], vocab: Map[String, Int], specialTokens: SpecialTokens, padWithSentenceTokens: Boolean = false)

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
val appendForPieceId: Option[String]

Attributes
protected
Definition Classes
BpeTokenizer
final def asInstanceOf[T0]: T0

Definition Classes
Any
def bpe(indToken: IndexedToken): Array[TokenPiece]
Run the BPE algorithm.
Run the BPE algorithm. The goal is to represent the token with the largest words in the known vocabulary. If that is not possible, the word is split into smaller subwords until they are known.
returns
Array of TokenPieces, corresponding to encoded token

Attributes
protected
Definition Classes
BpeTokenizer
val bpeRanks: Map[(String, String), Int]

Attributes
protected
Definition Classes
BpeTokenizer
val cache: Map[String, Array[String]]
Cache for already encoded tokens.
Cache for already encoded tokens.

Attributes
protected
Definition Classes
BpeTokenizer
def clone(): AnyRef

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
def encode(indTokens: Array[IndexedToken]): Array[TokenPiece]

Definition Classes
BpeTokenizer
def encode(indToken: IndexedToken): Array[TokenPiece]

Definition Classes
BpeTokenizer
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def finalize(): Unit

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
def getBpeRanking: ((String, String)) ⇒ Int
Rankings for the byte pairs.
Rankings for the byte pairs. Derived from merges.txt

Attributes
protected
Definition Classes
BpeTokenizer
def getBytePairs(word: Array[String]): Array[(String, String)]
Create a sequence of byte-pairs of the word.
Create a sequence of byte-pairs of the word. TODO: XLM has to append to end.

Attributes
protected
Definition Classes
BpeTokenizer
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
Annotations
@native()
def getTokenPieces(indToken: IndexedToken, word: Array[String], processedToken: String): Array[TokenPiece]

Attributes
protected
Definition Classes
BpeTokenizer
def hashCode(): Int

Definition Classes
AnyRef → Any
Annotations
@native()
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
Annotations
@native()
final def notifyAll(): Unit

Definition Classes
AnyRef
Annotations
@native()
def performMerges(wordChars: Array[String], charPairs: Array[(String, String)]): Array[String]

Attributes
protected
Definition Classes
BpeTokenizer
def preProcessTokenForBpe(token: String): String

Definition Classes
RobertaTokenizer → BpeTokenizer
val prependForPieceId: Option[String]

Definition Classes
RobertaTokenizer → BpeTokenizer
val sentencePadding: (String, String)
Special tokens of the model for processing
Special tokens of the model for processing

Definition Classes
BpeTokenizer
def splitOnSpecialToken(specialToken: SpecialToken, text: String): ListBuffer[String]
Split the individual sub-texts on special tokens.
Split the individual sub-texts on special tokens, e.g. for masking.

Attributes
protected
Definition Classes
BpeTokenizer
val splitPattern: Regex
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
def tokenize(sentence: Sentence): Array[IndexedToken]
Tokenize considering special tokens and split algorithm
Tokenize considering special tokens and split algorithm

Definition Classes
BpeTokenizer
def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
Needs to be implemented
Needs to be implemented

Definition Classes
RobertaTokenizer → BpeTokenizer
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... ) @native()

Packages

RobertaTokenizer

class RobertaTokenizer extends BpeTokenizer

Instance Constructors

Value Members

Inherited from BpeTokenizer

Inherited from AnyRef

Inherited from Any

Ungrouped

Packages

RobertaTokenizer 

class RobertaTokenizer extends BpeTokenizer

Instance Constructors

Value Members

Inherited from BpeTokenizer

Inherited from AnyRef

Inherited from Any

Ungrouped

RobertaTokenizer