public class Word2Vec extends WordVectorsImpl
Modifier and Type | Class and Description |
---|---|
static class |
Word2Vec.Builder |
Modifier and Type | Field and Description |
---|---|
protected com.google.common.util.concurrent.AtomicDouble |
alpha |
protected int |
batchSize |
protected Word2VecConfiguration |
configuration |
protected DocumentIterator |
docIter |
protected org.apache.commons.math3.random.RandomGenerator |
g |
protected InvertedIndex |
invertedIndex |
protected int |
learningRateDecayWords |
protected static org.slf4j.Logger |
log |
protected double |
minLearningRate |
protected int |
numIterations |
protected boolean |
resetModel |
protected double |
sample |
protected boolean |
saveVocab |
protected long |
seed |
protected SentenceIterator |
sentenceIter |
protected static long |
serialVersionUID |
protected boolean |
shouldReset |
protected TokenizerFactory |
tokenizerFactory |
protected long |
totalWords |
static String |
UNK |
protected boolean |
useAdaGrad |
protected TextVectorizer |
vectorizer |
protected VocabularyHolder |
vocabularyHolder |
protected int |
window |
protected int |
workers |
layerSize, lookupTable, minWordFrequency, stopWords, vocab
Constructor and Description |
---|
Word2Vec() |
Modifier and Type | Method and Description |
---|---|
protected void |
addWords(List<VocabWord> sentence,
AtomicLong nextRandom,
List<VocabWord> currMiniBatch) |
protected void |
buildBinaryTree() |
boolean |
buildVocab()
Builds the vocabulary for training
|
protected List<VocabWord> |
digitizeSentence(List<String> tokens)
Returns the sentence as a list of words from the vocabulary.
|
VocabCache |
fillSpecialVocabulary(SentenceIterator iterator,
int minWord)
This method can be used to build a vocabulary from a special source that should be treated separately.
|
protected int |
fillVocabulary(List<String> tokens)
This method adds all unknown words to the vocabulary.
|
void |
fit()
Train the model
|
SentenceIterator |
getSentenceIter() |
List<String> |
getStopWords() |
TokenizerFactory |
getTokenizerFactory() |
TextVectorizer |
getVectorizer() |
int |
getWindow() |
void |
iterate(VocabWord w1,
VocabWord w2,
AtomicLong nextRandom,
double alpha)
Train the word vector
on the given words
|
protected void |
readStopWords() |
protected void |
resetWeights() |
void |
resetWeightsOnSetup()
Restarts training on the next fit().
|
void |
setSentenceIter(SentenceIterator sentenceIter)
Note that calling a setter on this
assumes that this is a training continuation
and therefore weights should not be reset.
|
void |
setTokenizerFactory(TokenizerFactory tokenizerFactory) |
void |
setup()
Build the binary tree
Reset the weights
|
void |
setVectorizer(TextVectorizer vectorizer) |
void |
skipGram(int i,
List<VocabWord> sentence,
int b,
AtomicLong nextRandom,
double alpha)
Train via skip gram
|
void |
trainSentence(List<VocabWord> sentence,
AtomicLong nextRandom,
double alpha)
Train on a list of vocab words
|
accuracy, getWordVector, getWordVectorMatrix, getWordVectorMatrixNormalized, hasWord, indexOf, lookupTable, setLookupTable, setVocab, similarity, similarWordsInVocabTo, vocab, wordsNearest, wordsNearest, wordsNearest, wordsNearestSum, wordsNearestSum, wordsNearestSum
protected static final long serialVersionUID
protected transient Word2VecConfiguration configuration
protected transient TokenizerFactory tokenizerFactory
protected transient SentenceIterator sentenceIter
protected transient DocumentIterator docIter
protected transient TextVectorizer vectorizer
protected transient InvertedIndex invertedIndex
protected transient VocabularyHolder vocabularyHolder
protected transient org.apache.commons.math3.random.RandomGenerator g
protected transient int workers
protected int batchSize
protected double sample
protected long totalWords
protected com.google.common.util.concurrent.AtomicDouble alpha
protected int window
protected static final org.slf4j.Logger log
protected boolean shouldReset
protected int numIterations
public static final String UNK
protected long seed
protected boolean saveVocab
protected double minLearningRate
protected int learningRateDecayWords
protected boolean useAdaGrad
protected boolean resetModel
public TextVectorizer getVectorizer()
public void setVectorizer(TextVectorizer vectorizer)
protected int fillVocabulary(List<String> tokens)
tokens
- list of strings received from Tokenizer
public VocabCache fillSpecialVocabulary(SentenceIterator iterator, int minWord)
iterator
-
protected List<VocabWord> digitizeSentence(List<String> tokens)
tokens
- list of tokens from the sentence
public void fit() throws IOException
IOException
protected void addWords(List<VocabWord> sentence, AtomicLong nextRandom, List<VocabWord> currMiniBatch)
public void setup()
public boolean buildVocab()
public void trainSentence(List<VocabWord> sentence, AtomicLong nextRandom, double alpha)
sentence
- the list of vocab words to train on
public void skipGram(int i, List<VocabWord> sentence, int b, AtomicLong nextRandom, double alpha)
i
-
sentence
-
public void iterate(VocabWord w1, VocabWord w2, AtomicLong nextRandom, double alpha)
w1
- the first word to fit
protected void buildBinaryTree()
protected void resetWeights()
protected void readStopWords()
public void setSentenceIter(SentenceIterator sentenceIter)
sentenceIter
-
public void resetWeightsOnSetup()
public int getWindow()
public SentenceIterator getSentenceIter()
public TokenizerFactory getTokenizerFactory()
public void setTokenizerFactory(TokenizerFactory tokenizerFactory)
Copyright © 2015. All Rights Reserved.