public abstract class BaseTextVectorizer extends Object implements TextVectorizer
Modifier and Type | Field and Description |
---|---|
protected int |
batchSize |
protected VocabCache |
cache |
protected DocumentIterator |
docIter |
protected InvertedIndex |
index |
protected List<String> |
labels |
protected int |
minWordFrequency |
protected AtomicLong |
numWordsEncountered |
protected double |
sample |
protected SentenceIterator |
sentenceIterator |
protected List<String> |
stopWords |
protected TokenizerFactory |
tokenizerFactory |
protected static akka.actor.ActorSystem |
trainingSystem |
Modifier | Constructor and Description |
---|---|
|
BaseTextVectorizer() |
protected |
BaseTextVectorizer(VocabCache cache,
TokenizerFactory tokenizerFactory,
List<String> stopWords,
int layerSize,
int minWordFrequency,
DocumentIterator docIter,
SentenceIterator sentenceIterator,
List<String> labels,
InvertedIndex index,
int batchSize,
double sample) |
Modifier and Type | Method and Description |
---|---|
int |
batchSize()
For word vectors, this is the batch size used to partition documents
into workloads
|
void |
fit()
Train the model
|
VocabCache |
getCache() |
DocumentIterator |
getDocIter() |
int |
getLayerSize() |
int |
getMinWordFrequency() |
SentenceIterator |
getSentenceIterator() |
List<String> |
getStopWords() |
TokenizerFactory |
getTokenizerFactory() |
InvertedIndex |
index()
Inverted index
|
long |
numWordsEncountered()
Returns the number of words encountered so far
|
double |
sample()
Sampling for building mini batches
|
void |
setCache(VocabCache cache) |
void |
setDocIter(DocumentIterator docIter) |
void |
setLayerSize(int layerSize) |
void |
setMinWordFrequency(int minWordFrequency) |
void |
setSentenceIterator(SentenceIterator sentenceIterator) |
void |
setStopWords(List<String> stopWords) |
void |
setTokenizerFactory(TokenizerFactory tokenizerFactory) |
VocabCache |
vocab()
The vocab sorted in descending order
|
Methods inherited from class java.lang.Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
transform, vectorize, vectorize, vectorize
vectorize
protected transient VocabCache cache
protected static akka.actor.ActorSystem trainingSystem
protected transient TokenizerFactory tokenizerFactory
protected int minWordFrequency
protected transient DocumentIterator docIter
protected transient SentenceIterator sentenceIterator
protected AtomicLong numWordsEncountered
protected InvertedIndex index
protected int batchSize
protected double sample
public BaseTextVectorizer()
protected BaseTextVectorizer(VocabCache cache, TokenizerFactory tokenizerFactory, List<String> stopWords, int layerSize, int minWordFrequency, DocumentIterator docIter, SentenceIterator sentenceIterator, List<String> labels, InvertedIndex index, int batchSize, double sample)
public int batchSize()
Description copied from interface: TextVectorizer
Specified by:
batchSize in interface TextVectorizer
public double sample()
Description copied from interface: TextVectorizer
Specified by:
sample in interface TextVectorizer
public void fit()
Description copied from interface: TextVectorizer
Specified by:
fit in interface TextVectorizer
public VocabCache vocab()
Description copied from interface: TextVectorizer
Specified by:
vocab in interface TextVectorizer
public SentenceIterator getSentenceIterator()
public void setSentenceIterator(SentenceIterator sentenceIterator)
public DocumentIterator getDocIter()
public void setDocIter(DocumentIterator docIter)
public int getMinWordFrequency()
public void setMinWordFrequency(int minWordFrequency)
public int getLayerSize()
public void setLayerSize(int layerSize)
public TokenizerFactory getTokenizerFactory()
public void setTokenizerFactory(TokenizerFactory tokenizerFactory)
public VocabCache getCache()
public void setCache(VocabCache cache)
public long numWordsEncountered()
Description copied from interface: TextVectorizer
Specified by:
numWordsEncountered in interface TextVectorizer
public InvertedIndex index()
Description copied from interface: TextVectorizer
Specified by:
index in interface TextVectorizer
Copyright © 2014. All rights reserved.