public class Word2Vec extends WordVectorsImpl
Modifier and Type | Class and Description |
---|---|
static class |
Word2Vec.Builder |
Modifier and Type | Field and Description |
---|---|
protected com.google.common.util.concurrent.AtomicDouble |
alpha |
protected int |
batchSize |
protected DocumentIterator |
docIter |
protected org.apache.commons.math3.random.RandomGenerator |
g |
protected InvertedIndex |
invertedIndex |
protected int |
learningRateDecayWords |
protected static org.slf4j.Logger |
log |
protected double |
minLearningRate |
protected int |
numIterations |
protected double |
sample |
protected boolean |
saveVocab |
protected long |
seed |
protected SentenceIterator |
sentenceIter |
protected static long |
serialVersionUID |
protected boolean |
shouldReset |
protected TokenizerFactory |
tokenizerFactory |
protected long |
totalWords |
static String |
UNK |
protected boolean |
useAdaGrad |
protected TextVectorizer |
vectorizer |
protected int |
window |
protected int |
workers |
layerSize, lookupTable, minWordFrequency, stopWords, vocab
Constructor and Description |
---|
Word2Vec() |
Modifier and Type | Method and Description |
---|---|
protected void |
addWords(List<VocabWord> sentence,
AtomicLong nextRandom,
List<VocabWord> currMiniBatch) |
protected void |
buildBinaryTree() |
boolean |
buildVocab()
Builds the vocabulary for training
|
void |
fit()
Train the model
|
SentenceIterator |
getSentenceIter() |
List<String> |
getStopWords() |
TokenizerFactory |
getTokenizerFactory() |
TextVectorizer |
getVectorizer() |
int |
getWindow() |
void |
iterate(VocabWord w1,
VocabWord w2,
AtomicLong nextRandom,
double alpha)
Train the word vector
on the given words
|
protected void |
readStopWords() |
protected void |
resetWeights() |
void |
resetWeightsOnSetup()
Restart training on the next fit().
|
void |
setSentenceIter(SentenceIterator sentenceIter)
Note that calling a setter on this
assumes that this is a training continuation
and therefore weights should not be reset.
|
void |
setTokenizerFactory(TokenizerFactory tokenizerFactory) |
void |
setup()
Build the binary tree
Reset the weights
|
void |
setVectorizer(TextVectorizer vectorizer) |
void |
skipGram(int i,
List<VocabWord> sentence,
int b,
AtomicLong nextRandom,
double alpha)
Train via skip gram
|
void |
trainSentence(List<VocabWord> sentence,
AtomicLong nextRandom,
double alpha)
Train on a list of vocab words
|
accuracy, getWordVector, getWordVectorMatrix, getWordVectorMatrixNormalized, hasWord, indexOf, lookupTable, setLookupTable, setVocab, similarity, similarWordsInVocabTo, vocab, wordsNearest, wordsNearest, wordsNearest, wordsNearestSum, wordsNearestSum, wordsNearestSum
protected static final long serialVersionUID
protected transient TokenizerFactory tokenizerFactory
protected transient SentenceIterator sentenceIter
protected transient DocumentIterator docIter
protected int batchSize
protected double sample
protected long totalWords
protected com.google.common.util.concurrent.AtomicDouble alpha
protected int window
protected transient org.apache.commons.math3.random.RandomGenerator g
protected static final org.slf4j.Logger log
protected boolean shouldReset
protected int numIterations
public static final String UNK
protected long seed
protected boolean saveVocab
protected double minLearningRate
protected transient TextVectorizer vectorizer
protected int learningRateDecayWords
protected InvertedIndex invertedIndex
protected boolean useAdaGrad
protected int workers
public TextVectorizer getVectorizer()
public void setVectorizer(TextVectorizer vectorizer)
public void fit() throws IOException
IOException
protected void addWords(List<VocabWord> sentence, AtomicLong nextRandom, List<VocabWord> currMiniBatch)
public void setup()
public boolean buildVocab()
public void trainSentence(List<VocabWord> sentence, AtomicLong nextRandom, double alpha)
sentence
- the list of vocab words to train on
public void skipGram(int i, List<VocabWord> sentence, int b, AtomicLong nextRandom, double alpha)
i
-
sentence
-
public void iterate(VocabWord w1, VocabWord w2, AtomicLong nextRandom, double alpha)
w1
- the first word to fit
protected void buildBinaryTree()
protected void resetWeights()
protected void readStopWords()
public void setSentenceIter(SentenceIterator sentenceIter)
sentenceIter
-
public void resetWeightsOnSetup()
public int getWindow()
public SentenceIterator getSentenceIter()
public TokenizerFactory getTokenizerFactory()
public void setTokenizerFactory(TokenizerFactory tokenizerFactory)
Copyright © 2015. All Rights Reserved.