public abstract class BaseTextVectorizer extends Object implements TextVectorizer
| Modifier and Type | Field and Description |
|---|---|
| `protected int` | `batchSize` |
| `protected VocabCache` | `cache` |
| `protected boolean` | `cleanup` |
| `protected DocumentIterator` | `docIter` |
| `protected InvertedIndex` | `index` |
| `protected List<String>` | `labels` |
| `protected LabelAwareSentenceIterator` | `labelSentenceIter` |
| `protected int` | `minWordFrequency` |
| `protected AtomicLong` | `numWordsEncountered` |
| `protected double` | `sample` |
| `protected SentenceIterator` | `sentenceIterator` |
| `protected boolean` | `stem` |
| `protected List<String>` | `stopWords` |
| `protected TokenizerFactory` | `tokenizerFactory` |
| `protected akka.actor.ActorSystem` | `trainingSystem` |
| Modifier | Constructor and Description |
|---|---|
| | `BaseTextVectorizer()` |
| `protected` | `BaseTextVectorizer(VocabCache cache, TokenizerFactory tokenizerFactory, List<String> stopWords, int minWordFrequency, DocumentIterator docIter, SentenceIterator sentenceIterator, List<String> labels, InvertedIndex index, int batchSize, double sample, boolean stem, boolean cleanup)` |
| Modifier and Type | Method and Description |
|---|---|
| `int` | `batchSize()` — For word vectors, this is the batch size for how to partition documents into workloads |
| `void` | `fit()` — Train the model |
| `VocabCache` | `getCache()` |
| `DocumentIterator` | `getDocIter()` |
| `int` | `getMinWordFrequency()` |
| `SentenceIterator` | `getSentenceIterator()` |
| `List<String>` | `getStopWords()` |
| `TokenizerFactory` | `getTokenizerFactory()` |
| `InvertedIndex` | `index()` — Inverted index |
| `long` | `numWordsEncountered()` — Returns the number of words encountered so far |
| `double` | `sample()` — Sampling for building mini batches |
| `void` | `setCache(VocabCache cache)` |
| `void` | `setDocIter(DocumentIterator docIter)` |
| `void` | `setMinWordFrequency(int minWordFrequency)` |
| `void` | `setSentenceIterator(SentenceIterator sentenceIterator)` |
| `void` | `setStopWords(List<String> stopWords)` |
| `void` | `setTokenizerFactory(TokenizerFactory tokenizerFactory)` |
| `VocabCache` | `vocab()` — The vocab sorted in descending order |
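Methods inherited from class Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
Methods inherited from interface TextVectorizer or its superinterfaces: transform, vectorize, vectorize, vectorize, vectorize
protected transient VocabCache cache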
protected transient akka.actor.ActorSystem trainingSystem
protected transient TokenizerFactory tokenizerFactory
protected int minWordFrequency
protected transient DocumentIterator docIter
protected transient SentenceIterator sentenceIterator
protected transient LabelAwareSentenceIterator labelSentenceIter
protected AtomicLong numWordsEncountered
protected InvertedIndex index
protected int batchSize
protected double sample
protected boolean stem
protected boolean cleanup
public BaseTextVectorizer()
protected BaseTextVectorizer(VocabCache cache, TokenizerFactory tokenizerFactory, List<String> stopWords, int minWordFrequency, DocumentIterator docIter, SentenceIterator sentenceIterator, List<String> labels, InvertedIndex index, int batchSize, double sample, boolean stem, boolean cleanup)
public int batchSize()
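Description copied from interface: TextVectorizer
For word vectors, this is the batch size for how to partition documents into workloads
Specified by: batchSize in interface TextVectorizer
public double sample()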
Description copied from interface: TextVectorizer
Sampling for building mini batches
Specified by: sample in interface TextVectorizer
public void fit()
Description copied from interface: TextVectorizer
Train the model
Specified by: fit in interface TextVectorizer
public VocabCache vocab()
Description copied from interface: TextVectorizer
The vocab sorted in descending order
Specified by: vocab in interface TextVectorizer
public SentenceIterator getSentenceIterator()
public void setSentenceIterator(SentenceIterator sentenceIterator)
public DocumentIterator getDocIter()
public void setDocIter(DocumentIterator docIter)
public int getMinWordFrequency()
public void setMinWordFrequency(int minWordFrequency)
public TokenizerFactory getTokenizerFactory()
public void setTokenizerFactory(TokenizerFactory tokenizerFactory)
public VocabCache getCache()
public void setCache(VocabCache cache)
public long numWordsEncountered()
Description copied from interface: TextVectorizer
Returns the number of words encountered so far
Specified by: numWordsEncountered in interface TextVectorizer
public InvertedIndex index()
Description copied from interface: TextVectorizer
Inverted index
Specified by: index in interface TextVectorizer

Copyright © 2015. All rights reserved.