class JanusTokenizer extends BpeTokenizer
Inheritance
- JanusTokenizer
- BpeTokenizer
- AnyRef
- Any
Instance Constructors
- new JanusTokenizer(merges: Map[(String, String), Int], vocab: Map[String, Int], specialTokens: SpecialTokens, padWithSequenceTokens: Boolean = true, prependString: String = "", addPrefixSpaceToSentence: Boolean = false, alwaysAddPrefix: Boolean = true, splitPatternRegex: Regex = raw"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""".r)
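A minimal construction sketch, not taken from the library's documentation: it shows the shape of the `merges` and `vocab` arguments with toy values and leaves `SpecialTokens` as a placeholder, since its construction is model-specific; the remaining constructor parameters keep their defaults.
```scala
// Construction sketch with toy values; `SpecialTokens` construction is model-specific
// and therefore left as a placeholder here.
val vocab: Map[String, Int] =
  Map("l" -> 0, "o" -> 1, "w" -> 2, "lo" -> 3, "low" -> 4)

val merges: Map[(String, String), Int] =
  Map(("l", "o") -> 0, ("lo", "w") -> 1)   // lower value = higher merge priority

val specialTokens: SpecialTokens = ???     // placeholder: built from the model's special-token strings

val tokenizer = new JanusTokenizer(merges, vocab, specialTokens)
```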
Value Members
- final def !=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- final def ##: Int
- Definition Classes
- AnyRef → Any
- final def ==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val addPrefixSpaceToSentence: Boolean
- Definition Classes
- BpeTokenizer
- val alwaysAddPrefix: Boolean
- Definition Classes
- BpeTokenizer
- final def asInstanceOf[T0]: T0
- Definition Classes
- Any
- def bpe(indToken: IndexedToken): Array[TokenPiece]
Runs the BPE algorithm. The goal is to match the token against the largest entries in the known vocabulary; if that is not possible, the word is split into smaller subwords until every piece is known (a standalone sketch of the merge loop follows this entry).
- returns
Array of TokenPieces, corresponding to encoded token
- Attributes
- protected
- Definition Classes
- BpeTokenizer
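The following is a standalone sketch of the greedy merge loop the description refers to, not this class's actual implementation: it repeatedly merges the adjacent pair with the lowest rank until no rankable pair remains. The names `bpeSketch` and `ranks` are illustrative only.
```scala
// Illustrative only: greedy BPE merging by pair rank (lowest rank merges first).
def bpeSketch(word: Array[String], ranks: Map[(String, String), Int]): Array[String] = {
  var pieces = word
  var done   = false
  while (!done && pieces.length > 1) {
    val pairs = pieces.sliding(2).map { case Array(a, b) => (a, b) }.toArray
    pairs.filter(ranks.contains).sortBy(ranks).headOption match {
      case None => done = true                      // no known pair left to merge
      case Some(best) =>
        val merged = scala.collection.mutable.ArrayBuffer.empty[String]
        var i = 0
        while (i < pieces.length) {
          if (i < pieces.length - 1 && (pieces(i), pieces(i + 1)) == best) {
            merged += pieces(i) + pieces(i + 1)     // merge the best-ranked pair
            i += 2
          } else {
            merged += pieces(i)
            i += 1
          }
        }
        pieces = merged.toArray
    }
  }
  pieces
}

// Example: bpeSketch(Array("l", "o", "w"), Map(("l", "o") -> 0, ("lo", "w") -> 1)) yields Array("low")
```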
- val bpeRanks: Map[(String, String), Int]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- val bytesToUnicodeMapping: Map[Int, String]
Mapping from bytes to a different set of Unicode characters (especially whitespace). This improved model performance for GPT-2 (a standalone sketch of such a table follows this entry).
- Attributes
- protected
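The sketch below shows the byte-to-unicode table commonly used for GPT-2-style byte-level BPE, included only to illustrate the idea; whether this class builds its table exactly this way is not shown on this page. Printable bytes map to themselves, and the remaining byte values are shifted to code points from 256 upward so every byte becomes a visible character.
```scala
// Illustrative GPT-2-style byte-to-unicode table: printable bytes map to themselves,
// the rest are remapped to visible characters starting at code point 256.
def bytesToUnicodeSketch: Map[Int, String] = {
  val printable = ('!'.toInt to '~'.toInt) ++ ('¡'.toInt to '¬'.toInt) ++ ('®'.toInt to 'ÿ'.toInt)
  var shift = 0
  (0 until 256).map { b =>
    if (printable.contains(b)) b -> b.toChar.toString
    else {
      val mapped = b -> (256 + shift).toChar.toString
      shift += 1
      mapped
    }
  }.toMap
}
```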
- val cache: Map[String, Array[String]]
Cache for already encoded tokens.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- def clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.CloneNotSupportedException]) @HotSpotIntrinsicCandidate() @native()
- def decodeTokens(tokens: Array[Int]): String
- val decoderVocab: Map[Int, String]
- Attributes
- protected
- def encode(indTokens: Array[IndexedToken]): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
- def encode(indToken: IndexedToken): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
- final def eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- def equals(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef → Any
- def getBpeRanking: ((String, String)) => Int
Rankings for the byte pairs, derived from merges.txt (see the derivation sketch after this entry).
- Attributes
- protected
- Definition Classes
- BpeTokenizer
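A sketch of the conventional derivation of pair rankings from a merges.txt-style file, where the line position becomes the pair's rank; the file name and header handling are assumptions, not taken from this page.
```scala
// Assumed merges.txt layout: one "left right" pair per line, earlier lines = better rank.
val bpeRanksSketch: Map[(String, String), Int] =
  scala.io.Source.fromFile("merges.txt").getLines()
    .filterNot(_.startsWith("#"))                    // skip a possible "#version ..." header
    .map(_.trim.split(" "))
    .collect { case Array(left, right) => (left, right) }
    .zipWithIndex
    .toMap
```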
- def getBytePairs(word: Array[String]): Array[(String, String)]
Creates the sequence of adjacent byte pairs of the word (a standalone sketch follows this entry).
- Attributes
- protected
- Definition Classes
- BpeTokenizer
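A standalone sketch of byte-pair generation, i.e. the adjacent symbol pairs that become merge candidates; the helper name is illustrative only.
```scala
// Illustrative only: adjacent pairs of symbols in a word.
def bytePairsSketch(word: Array[String]): Array[(String, String)] =
  if (word.length < 2) Array.empty[(String, String)]
  else word.sliding(2).map { case Array(a, b) => (a, b) }.toArray

// Example: bytePairsSketch(Array("h", "e", "l", "l", "o"))
//          yields Array(("h","e"), ("e","l"), ("l","l"), ("l","o"))
```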
- final def getClass(): Class[_ <: AnyRef]
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- def getTokenPieces(indToken: IndexedToken, word: Array[String]): Array[TokenPiece]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- def hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val merges: Map[(String, String), Int]
- Definition Classes
- BpeTokenizer
- final def ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- final def notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- val padWithSequenceTokens: Boolean
- Definition Classes
- BpeTokenizer
- def performMerges(wordChars: Array[String], charPairs: Array[(String, String)]): Array[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- def preProcessTokenForBpe(token: String): String
- Definition Classes
- JanusTokenizer → BpeTokenizer
- val prefixForPieceId: Option[String]
- Definition Classes
- JanusTokenizer → BpeTokenizer
- val sentencePadding: (String, String)
Special tokens of the model for processing.
- Definition Classes
- BpeTokenizer
- val specialTokens: SpecialTokens
- Definition Classes
- BpeTokenizer
- def splitOnSpecialToken(specialToken: SpecialToken, text: String): ListBuffer[String]
Splits the individual sub-texts on special tokens, e.g. masking (a standalone sketch follows this entry).
- Attributes
- protected
- Definition Classes
- BpeTokenizer
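The sketch below splits a text on one special-token string while keeping the token as its own piece. It illustrates the idea only and is not this class's implementation; the helper name and the `<mask>` string are made up for the example.
```scala
// Illustrative only: split a text on one special-token string, keeping the token as its own piece.
def splitOnTokenSketch(token: String, text: String): List[String] = {
  import scala.collection.mutable.ListBuffer
  val out  = ListBuffer.empty[String]
  var rest = text
  var idx  = rest.indexOf(token)
  while (idx >= 0) {
    if (idx > 0) out += rest.substring(0, idx)   // text before the special token
    out += token                                 // the special token itself
    rest = rest.substring(idx + token.length)
    idx  = rest.indexOf(token)
  }
  if (rest.nonEmpty) out += rest                 // trailing text after the last token
  out.toList
}

// Example: splitOnTokenSketch("<mask>", "Paris is the <mask> of France.")
//          yields List("Paris is the ", "<mask>", " of France.")
```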
- val splitPattern: Regex
- val suffixForPieceId: Option[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- final def synchronized[T0](arg0: => T0): T0
- Definition Classes
- AnyRef
- def toString(): String
- Definition Classes
- AnyRef → Any
- def tokenize(sentence: Sentence): Array[IndexedToken]
Tokenizes the sentence, taking special tokens and the split algorithm into account (a hedged usage sketch follows this entry).
- Definition Classes
- BpeTokenizer
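A hedged usage sketch using only the signatures listed on this page; constructing a `Sentence` is library-specific, so it is left as a placeholder rather than guessed.
```scala
// Usage flow sketch; `tokenizer` is the instance constructed earlier,
// `Sentence` construction is left as a placeholder.
val sentence: Sentence = ???                                      // placeholder input sentence

val indexed: Array[IndexedToken] = tokenizer.tokenize(sentence)   // split on special tokens + regex
val pieces:  Array[TokenPiece]   = tokenizer.encode(indexed)      // BPE-encode into vocabulary pieces
```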
- def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
Needs to be implemented.
- Definition Classes
- JanusTokenizer → BpeTokenizer
- val unicodeToByteMapping: Map[String, Int]
- Attributes
- protected
- val vocab: Map[String, Int]
- Definition Classes
- BpeTokenizer
- final def wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- final def wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException]) @native()
- final def wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
Deprecated Value Members
- def finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.Throwable]) @Deprecated
- Deprecated
(Since version 9)