class CLIPTokenizer extends Gpt2Tokenizer
- By Inheritance
- CLIPTokenizer
- Gpt2Tokenizer
- BpeTokenizer
- AnyRef
- Any
Instance Constructors
- new CLIPTokenizer(merges: Map[(String, String), Int], vocab: Map[String, Int], specialTokens: SpecialTokens)
Value Members
- final def !=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- final def ##: Int
- Definition Classes
- AnyRef → Any
- final def ==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val addPrefixSpaceToSentence: Boolean
- Definition Classes
- BpeTokenizer
- val alwaysAddPrefix: Boolean
- Definition Classes
- BpeTokenizer
- final def asInstanceOf[T0]: T0
- Definition Classes
- Any
- def bpe(indToken: IndexedToken): Array[TokenPiece]
CLIP-specific tokenization. "</w>" is appended to word ends.
- returns
Array of TokenPieces, corresponding to encoded token
- Attributes
- protected
- Definition Classes
- CLIPTokenizer → BpeTokenizer
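A minimal standalone sketch of this CLIP-style BPE step (an illustration, not the actual Spark NLP implementation; the merge ranks below are toy stand-ins for a real merges.txt): the word's last character gets "</w>" appended before merges are applied, so word-final pieces stay distinct in the vocabulary.

```scala
import scala.collection.mutable.ArrayBuffer

object ClipBpeSketch {
  // Toy merge ranks, as they would be derived from a merges.txt file.
  val bpeRanks: Map[(String, String), Int] =
    Map(("l", "o") -> 0, ("lo", "w</w>") -> 1)

  def bpe(word: String): Array[String] = {
    // Split into characters and mark the end of the word with "</w>".
    var pieces: Array[String] =
      word.dropRight(1).map(_.toString).toArray :+ (word.last.toString + "</w>")
    var done = false
    while (!done && pieces.length > 1) {
      // Find adjacent pairs that have a merge rank.
      val candidates = pieces.init.zip(pieces.tail).filter(bpeRanks.contains)
      if (candidates.isEmpty) done = true
      else {
        // Lowest rank (earliest line in merges.txt) merges first.
        val best = candidates.minBy(bpeRanks)
        val merged = ArrayBuffer.empty[String]
        var i = 0
        while (i < pieces.length) {
          if (i < pieces.length - 1 && (pieces(i), pieces(i + 1)) == best) {
            merged += pieces(i) + pieces(i + 1); i += 2
          } else { merged += pieces(i); i += 1 }
        }
        pieces = merged.toArray
      }
    }
    pieces
  }
}
```

With the toy ranks above, `ClipBpeSketch.bpe("low")` collapses to the single piece `"low</w>"`.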
- val bpeRanks: Map[(String, String), Int]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- val bytesToUnicodeMapping: Map[Int, String]
Mapping of bytes to a different set of unicode characters (especially whitespace). This improves model performance for GPT-2.
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
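A sketch of the GPT-2 byte-to-unicode trick this field encodes (a reimplementation of the published algorithm, not the field itself): printable bytes map to themselves, while the remaining bytes (control characters, whitespace) are shifted to unused code points above 255 so every byte becomes a visible, space-free character.

```scala
def bytesToUnicode(): Map[Int, String] = {
  // Bytes that are already printable and not whitespace keep their character.
  val printable = ((33 to 126) ++ (161 to 172) ++ (174 to 255)).toSet
  var shift = 0
  (0 to 255).map { b =>
    if (printable(b)) b -> b.toChar.toString
    else {
      // Non-printable bytes get consecutive code points starting at 256.
      val mapped = (256 + shift).toChar.toString
      shift += 1
      b -> mapped
    }
  }.toMap
}
```

Under this mapping the space byte (32) lands on 'Ġ' (U+0120), which is why GPT-2-style vocabularies show that character where a leading space was.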
- val cache: Map[String, Array[String]]
Cache for already encoded tokens.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- def clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.CloneNotSupportedException]) @HotSpotIntrinsicCandidate() @native()
- def decodeTokens(tokens: Array[Int]): String
- Definition Classes
- Gpt2Tokenizer
- val decoderVocab: Map[Int, String]
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
- def encode(indTokens: Array[IndexedToken]): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
- def encode(indToken: IndexedToken): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
- final def eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- def equals(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef → Any
- def getBpeRanking: ((String, String)) => Int
Rankings for the byte pairs, derived from merges.txt.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
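The derivation from merges.txt can be sketched as follows (an assumption following the common BPE convention: earlier lines in the file merge first, i.e. rank 0 is the highest-priority merge; the lines here are toy stand-ins).

```scala
// Toy stand-in for the contents of a merges.txt file.
val mergesTxt = Seq("l o", "lo w</w>", "e r</w>")

// Each line is a pair of symbols; its line index becomes its rank.
val bpeRanks: Map[(String, String), Int] =
  mergesTxt.zipWithIndex.map { case (line, rank) =>
    val Array(left, right) = line.split(" ")
    (left, right) -> rank
  }.toMap

// A Map is already a function from pair to rank.
val getBpeRanking: ((String, String)) => Int = bpeRanks
```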
- def getBytePairs(word: Array[String]): Array[(String, String)]
Creates a sequence of byte pairs from the word.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
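The pair extraction itself is the standard BPE step (a sketch under that assumption): each symbol of the word zipped with its successor.

```scala
// Adjacent symbol pairs of a word; a word with fewer than two symbols has none.
def getBytePairs(word: Array[String]): Array[(String, String)] =
  if (word.length < 2) Array.empty
  else word.init.zip(word.tail)
```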
- final def getClass(): Class[_ <: AnyRef]
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- def getTokenPieces(indToken: IndexedToken, word: Array[String]): Array[TokenPiece]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- def hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val merges: Map[(String, String), Int]
- Definition Classes
- BpeTokenizer
- final def ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- final def notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- final def notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @HotSpotIntrinsicCandidate() @native()
- val padWithSequenceTokens: Boolean
- Definition Classes
- BpeTokenizer
- def performMerges(wordChars: Array[String], charPairs: Array[(String, String)]): Array[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- def preProcessTokenForBpe(token: String): String
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
- val prefixForPieceId: Option[String]
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
- val sentencePadding: (String, String)
Special tokens of the model for processing
- Definition Classes
- BpeTokenizer
- val specialTokens: SpecialTokens
- Definition Classes
- BpeTokenizer
- def splitOnSpecialToken(specialToken: SpecialToken, text: String): ListBuffer[String]
Splits the individual sub-texts on special tokens, e.g. masking etc.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- val splitPattern: Regex
- Definition Classes
- CLIPTokenizer → Gpt2Tokenizer
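A split pattern modeled on the original CLIP tokenizer's regex (an assumption; the exact pattern this field holds may differ). It peels off common English contractions, runs of letters, single digits, and runs of other non-space characters.

```scala
// Alternatives are tried left to right: contractions first, then letter runs,
// single digits, and finally punctuation/symbol runs.
val splitPattern: scala.util.matching.Regex =
  """'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}|[^\s\p{L}\p{N}]+""".r

val parts = splitPattern.findAllIn("a dog's photo!").toList
```

On the sample input, `parts` comes out as `List("a", "dog", "'s", "photo", "!")`: whitespace is never matched and so acts as a separator.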
- val suffixForPieceId: Option[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
- final def synchronized[T0](arg0: => T0): T0
- Definition Classes
- AnyRef
- def toString(): String
- Definition Classes
- AnyRef → Any
- def tokenize(sentence: Sentence): Array[IndexedToken]
Tokenizes considering special tokens and the split algorithm.
- Definition Classes
- BpeTokenizer
- def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
Needs to be implemented
- Definition Classes
- CLIPTokenizer → Gpt2Tokenizer → BpeTokenizer
- val unicodeToByteMapping: Map[String, Int]
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
- val vocab: Map[String, Int]
- Definition Classes
- BpeTokenizer
- final def wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
- final def wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException]) @native()
- final def wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.InterruptedException])
Deprecated Value Members
- def finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws(classOf[java.lang.Throwable]) @Deprecated
- Deprecated
(Since version 9)