public class Word2Vec extends Object implements Persistable
Modifier and Type | Class and Description |
---|---|
static class |
Word2Vec.Builder |
Constructor and Description |
---|
Word2Vec() |
Modifier and Type | Method and Description |
---|---|
TreeSet<VocabWord> |
analogy(String word0,
String word1,
String word2) |
List<String> |
analogyWords(String w1,
String w2,
String w3)
Brings back a list of words that are analogous to the 3 words
presented in vector space
|
boolean |
buildVocab()
Builds the vocabulary for training
|
Set<VocabWord> |
distance(String word) |
void |
fit()
Train the model
|
VocabCache |
getCache() |
int |
getLayerSize() |
SentenceIterator |
getSentenceIter() |
List<String> |
getStopWords() |
TokenizerFactory |
getTokenizerFactory() |
int |
getWindow() |
double[] |
getWordVector(String word)
Get the word vector for a given word
|
org.nd4j.linalg.api.ndarray.INDArray |
getWordVectorMatrix(String word)
Get the word vector for a given word
|
org.nd4j.linalg.api.ndarray.INDArray |
getWordVectorMatrixNormalized(String word)
Returns the word vector divided by the norm2 of the array
|
boolean |
hasWord(String word)
Returns true if the model has this word in the vocab
|
int |
indexOf(String word) |
void |
iterate(VocabWord w1,
VocabWord w2)
Train the word vector
on the given words
|
void |
load(InputStream is) |
void |
plotTsne()
Create a tsne plot
|
void |
setCache(VocabCache cache) |
void |
setLayerSize(int layerSize) |
void |
setSentenceIter(SentenceIterator sentenceIter)
Note that calling this setter
assumes that this is a training continuation
and therefore weights should not be reset.
|
void |
setTokenizerFactory(TokenizerFactory tokenizerFactory) |
void |
setup() |
double |
similarity(String word,
String word2)
Returns the similarity of 2 words
|
List<String> |
similarWordsInVocabTo(String word,
double accuracy)
Find all words with similar characters
in the vocab
|
void |
skipGram(int i,
List<VocabWord> sentence,
int b)
Train via skip gram
|
Map<String,org.nd4j.linalg.api.ndarray.INDArray> |
toVocabFloat() |
List<VocabWord> |
trainSentence(InputStream is)
Train on the given sentence returning a list of vocab words
|
void |
trainSentence(List<VocabWord> sentence,
int doc)
Train on a list of vocab words
|
List<VocabWord> |
trainSentence(String sentence)
Train on the given sentence returning a list of vocab words
|
Collection<String> |
wordsNearest(String word,
int n)
Get the top n words most similar to the given word
|
void |
write(OutputStream os) |
public static final String UNK
public List<String> similarWordsInVocabTo(String word, double accuracy)
word
- the word to compare
accuracy
- the accuracy: 0 to 1
public int indexOf(String word)
public double[] getWordVector(String word)
word
- the word to get the matrix for
public org.nd4j.linalg.api.ndarray.INDArray getWordVectorMatrix(String word)
word
- the word to get the matrix for
public org.nd4j.linalg.api.ndarray.INDArray getWordVectorMatrixNormalized(String word)
word
- the word to get the matrix for
public Collection<String> wordsNearest(String word, int n)
word
- the word to compare
n
- the n to get
public List<String> analogyWords(String w1, String w2, String w3)
w1
-
w2
-
w3
-
public boolean hasWord(String word)
word
- the word to test for
public void fit()
public List<VocabWord> trainSentence(InputStream is)
is
- the sentence to fit on
public List<VocabWord> trainSentence(String sentence)
sentence
- the sentence to fit on
public void setup()
public boolean buildVocab()
public void plotTsne()
public void trainSentence(List<VocabWord> sentence, int doc)
sentence
- the list of vocab words to train on
public void skipGram(int i, List<VocabWord> sentence, int b)
i
-
sentence
-
public void iterate(VocabWord w1, VocabWord w2)
w1
- the first word to fit
public double similarity(String word, String word2)
word
- the first word
word2
- the second word
public void write(OutputStream os)
write
in interface Persistable
public void load(InputStream is)
load
in interface Persistable
public int getLayerSize()
public void setLayerSize(int layerSize)
public int getWindow()
public SentenceIterator getSentenceIter()
public TokenizerFactory getTokenizerFactory()
public void setTokenizerFactory(TokenizerFactory tokenizerFactory)
public VocabCache getCache()
public void setCache(VocabCache cache)
public void setSentenceIter(SentenceIterator sentenceIter)
sentenceIter
-
Copyright © 2014. All rights reserved.