public abstract class BaseTextVectorizer extends Object implements TextVectorizer
Modifier and Type | Field and Description |
---|---|
protected int |
batchSize |
protected VocabCache |
cache |
protected DocumentIterator |
docIter |
protected InvertedIndex |
index |
protected List<String> |
labels |
protected LabelAwareSentenceIterator |
labelSentenceIter |
protected int |
minWordFrequency |
protected AtomicLong |
numWordsEncountered |
protected double |
sample |
protected SentenceIterator |
sentenceIterator |
protected boolean |
stem |
protected List<String> |
stopWords |
protected TokenizerFactory |
tokenizerFactory |
protected akka.actor.ActorSystem |
trainingSystem |
Modifier | Constructor and Description |
---|---|
|
BaseTextVectorizer() |
protected |
BaseTextVectorizer(VocabCache cache,
TokenizerFactory tokenizerFactory,
List<String> stopWords,
int minWordFrequency,
DocumentIterator docIter,
SentenceIterator sentenceIterator,
List<String> labels,
InvertedIndex index,
int batchSize,
double sample,
boolean stem,
boolean cleanup) |
Modifier and Type | Method and Description |
---|---|
int |
batchSize()
For word vectors, this is the batch size for how to partition documents
into workloads
|
void |
fit()
Train the model
|
VocabCache |
getCache() |
DocumentIterator |
getDocIter() |
int |
getMinWordFrequency() |
SentenceIterator |
getSentenceIterator() |
List<String> |
getStopWords() |
TokenizerFactory |
getTokenizerFactory() |
InvertedIndex |
index()
Inverted index
|
long |
numWordsEncountered()
Returns the number of words encountered so far
|
double |
sample()
Sampling for building mini batches
|
void |
setCache(VocabCache cache) |
void |
setDocIter(DocumentIterator docIter) |
void |
setMinWordFrequency(int minWordFrequency) |
void |
setSentenceIterator(SentenceIterator sentenceIterator) |
void |
setStopWords(List<String> stopWords) |
void |
setTokenizerFactory(TokenizerFactory tokenizerFactory) |
VocabCache |
vocab()
The vocab sorted in descending order
|
Methods inherited from class Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
transform, vectorize, vectorize, vectorize
vectorize
protected transient VocabCache cache
protected transient akka.actor.ActorSystem trainingSystem
protected transient TokenizerFactory tokenizerFactory
protected int minWordFrequency
protected transient DocumentIterator docIter
protected transient SentenceIterator sentenceIterator
protected transient LabelAwareSentenceIterator labelSentenceIter
protected AtomicLong numWordsEncountered
protected InvertedIndex index
protected int batchSize
protected double sample
protected boolean stem
public BaseTextVectorizer()
protected BaseTextVectorizer(VocabCache cache, TokenizerFactory tokenizerFactory, List<String> stopWords, int minWordFrequency, DocumentIterator docIter, SentenceIterator sentenceIterator, List<String> labels, InvertedIndex index, int batchSize, double sample, boolean stem, boolean cleanup)
public int batchSize()
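Description copied from interface: TextVectorizer
For word vectors, this is the batch size for how to partition documents into workloads
Specified by: batchSize in interface TextVectorizer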
public double sample()
Description copied from interface: TextVectorizer
Sampling for building mini batches
Specified by: sample in interface TextVectorizer
public void fit()
Description copied from interface: TextVectorizer
Train the model
Specified by: fit in interface TextVectorizer
public VocabCache vocab()
Description copied from interface: TextVectorizer
The vocab sorted in descending order
Specified by: vocab in interface TextVectorizer
public SentenceIterator getSentenceIterator()
public void setSentenceIterator(SentenceIterator sentenceIterator)
public DocumentIterator getDocIter()
public void setDocIter(DocumentIterator docIter)
public int getMinWordFrequency()
public void setMinWordFrequency(int minWordFrequency)
public TokenizerFactory getTokenizerFactory()
public void setTokenizerFactory(TokenizerFactory tokenizerFactory)
public VocabCache getCache()
public void setCache(VocabCache cache)
public long numWordsEncountered()
Description copied from interface: TextVectorizer
Returns the number of words encountered so far
Specified by: numWordsEncountered in interface TextVectorizer
public InvertedIndex index()
Description copied from interface: TextVectorizer
Inverted index
Specified by: index in interface TextVectorizer
Copyright © 2015. All Rights Reserved.