Run the BPE algorithm. The goal is to match each word against the largest token in the known vocabulary. If no match is possible, the word is split into smaller subwords until all pieces are known.
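The greedy merge loop described above can be sketched as follows. This is a minimal illustration, not the project's actual implementation; the function name `bpe` and the `ranks` mapping (pair of strings to merge priority) are assumptions.

```python
def bpe(word, ranks):
    """Greedily merge the lowest-ranked adjacent pair until none remain."""
    pieces = list(word)
    while len(pieces) > 1:
        # Find the adjacent pair with the best (lowest) merge rank.
        pairs = [(pieces[i], pieces[i + 1]) for i in range(len(pieces) - 1)]
        best = min(pairs, key=lambda p: ranks.get(p, float("inf")))
        if best not in ranks:
            break  # no known merge left; the remaining pieces are the subwords
        # Merge every occurrence of the best pair.
        merged, i = [], 0
        while i < len(pieces):
            if i < len(pieces) - 1 and (pieces[i], pieces[i + 1]) == best:
                merged.append(pieces[i] + pieces[i + 1])
                i += 2
            else:
                merged.append(pieces[i])
                i += 1
        pieces = merged
    return pieces
```

A word covered by the merge table collapses to one token, while an unknown word falls back to smaller known pieces, down to single characters in the worst case.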
Array of TokenPieces corresponding to the encoded token.
Cache for already encoded tokens.
Rankings for the byte pairs, derived from merges.txt.
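Building the rankings might look like the sketch below: each line of merges.txt names a pair, and its position in the file is its rank (earlier lines merge first). The function name `load_ranks` and the header-skipping logic are assumptions.

```python
def load_ranks(lines):
    """Build a pair -> rank mapping from merges.txt lines."""
    ranks = {}
    for line in lines:
        if not line.strip() or line.startswith("#"):
            continue  # skip blank lines and the version header
        a, b = line.split()
        ranks[(a, b)] = len(ranks)  # rank = order of appearance
    return ranks
```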
Create the sequence of byte pairs of the word.
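A minimal sketch of this step, assuming the word is held as a list of current pieces; the helper name `get_pairs` is an assumption.

```python
def get_pairs(pieces):
    """Return the set of adjacent symbol pairs in the word's current pieces."""
    return {(pieces[i], pieces[i + 1]) for i in range(len(pieces) - 1)}
```

The resulting set is what gets looked up against the pair rankings on each merge iteration.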
Special tokens of the model, used during processing.
Split the individual sub-texts on special tokens, e.g. for masking.
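One common way to do this split is with a regex whose alternatives are the escaped special tokens, kept as their own segments via a capture group. This is a hedged sketch, not the project's implementation; the longest-first ordering is an assumption to make overlapping specials match greedily.

```python
import re

def split_on_special(text, special_tokens):
    """Split text so each special token becomes a standalone segment."""
    # Longest-first so tokens that contain other tokens win the match.
    pattern = "|".join(
        re.escape(t) for t in sorted(special_tokens, key=len, reverse=True)
    )
    parts = re.split(f"({pattern})", text)
    return [p for p in parts if p]  # drop empty segments
```

Ordinary segments then go through the BPE encoder, while special segments map directly to their reserved token ids.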
Tokenize, taking special tokens and the split algorithm into account.
Needs to be implemented