public class DataSet extends Pair<org.jblas.DoubleMatrix,org.jblas.DoubleMatrix> implements Persistable, Iterable<DataSet>
Nested classes/interfaces inherited from class Pair: Pair.DefaultLexicographicPairComparator<F extends Comparable<F>,S extends Comparable<S>>, Pair.FirstComparator<S extends Comparable<? super S>,T>, Pair.LexicographicPairComparator<F,S>, Pair.ReverseFirstComparator<S extends Comparable<? super S>,T>, Pair.ReverseSecondComparator<S,T extends Comparable<? super T>>, Pair.SecondComparator<S,T extends Comparable<? super T>>
Constructor and Description |
---|
DataSet() |
DataSet(org.jblas.DoubleMatrix first,
org.jblas.DoubleMatrix second) |
DataSet(Pair<org.jblas.DoubleMatrix,org.jblas.DoubleMatrix> pair) |
Modifier and Type | Method and Description |
---|---|
void |
addFeatureVector(org.jblas.DoubleMatrix toAdd)
Adds a feature for each example onto the current feature vector
|
void |
addFeatureVector(org.jblas.DoubleMatrix feature,
int example)
Appends the given feature vector to the example at the given example/row number
|
void |
addRow(DataSet d,
int i) |
List<DataSet> |
asList() |
List<List<DataSet>> |
batchBy(int num) |
List<List<DataSet>> |
batchByNumLabels() |
DataSet |
copy() |
List<DataSet> |
dataSetBatches(int num)
Partitions the data set by the specified number.
|
void |
divideBy(int num) |
static DataSet |
empty() |
org.jblas.DoubleMatrix |
exampleMaxs() |
org.jblas.DoubleMatrix |
exampleMeans() |
org.jblas.DoubleMatrix |
exampleSums() |
void |
filterAndStrip(int[] labels)
Strips the dataset down to the specified labels
and remaps them
|
DataSet |
filterBy(int[] labels)
Strips the data set of all but the passed in labels
|
DataSet |
get(int i)
Gets a copy of example i
|
Iterator<DataSet> |
iterator() |
DataSetIterator |
iterator(int batches) |
static DataSet |
load(File path) |
void |
load(InputStream is) |
static void |
main(String[] args) |
static DataSet |
merge(List<DataSet> data) |
void |
multiplyBy(int num) |
void |
normalize() |
void |
normalizeZeroMeanZeroUnitVariance() |
int |
numExamples() |
int |
numInputs() |
int |
numOutcomes() |
int |
outcome() |
Counter<Integer> |
outcomeCounts()
Gets the label distribution (counts of each possible outcome)
|
void |
roundInputToTheNearest(int numDecimalPlaces) |
void |
roundToTheNearest(int roundTo) |
DataSet |
sample(int numSamples)
Sample without replacement, using a random rng
|
DataSet |
sample(int numSamples,
boolean withReplacement)
Sample a dataset numSamples times
|
DataSet |
sample(int numSamples,
org.apache.commons.math3.random.RandomGenerator rng)
Sample without replacement
|
DataSet |
sample(int numSamples,
org.apache.commons.math3.random.RandomGenerator rng,
boolean withReplacement)
Sample a dataset
|
void |
saveTo(File file,
boolean binary) |
void |
scale() |
void |
setNewNumberOfLabels(int labels)
Clears the outcome matrix setting a new number of labels
|
void |
setOutcome(int example,
int label)
Sets the outcome of a particular example
|
void |
shuffle() |
List<List<DataSet>> |
sortAndBatchByNumLabels()
Sorts the dataset by label:
Splits the data set such that examples are sorted by their labels.
|
void |
sortByLabel()
Organizes the dataset to minimize sampling error
while still allowing efficient batching.
|
Pair<DataSet,DataSet> |
splitTestAndTrain(int numHoldout) |
String |
toString() |
void |
validate() |
void |
write(OutputStream os) |
public DataSet()
public DataSet(Pair<org.jblas.DoubleMatrix,org.jblas.DoubleMatrix> pair)
public DataSet(org.jblas.DoubleMatrix first, org.jblas.DoubleMatrix second)
public DataSetIterator iterator(int batches)
public DataSet copy()
public static DataSet empty()
public void multiplyBy(int num)
public void divideBy(int num)
public void shuffle()
public void roundInputToTheNearest(int numDecimalPlaces)
public void scale()
public void addFeatureVector(org.jblas.DoubleMatrix toAdd)
toAdd
- the feature vector to add
public void addFeatureVector(org.jblas.DoubleMatrix feature, int example)
feature
- the feature vector to add
example
- the number of the example to append to
public void normalize()
public void normalizeZeroMeanZeroUnitVariance()
public int numInputs()
public void validate()
public int outcome()
public void setNewNumberOfLabels(int labels)
labels
- the number of labels/columns in the outcome matrix.
Note that this clears the labels for each example.
public void setOutcome(int example, int label)
example
- the example to set
label
- the label of the outcome
public DataSet get(int i)
i
- the example to get
public Counter<Integer> outcomeCounts()
public DataSet filterBy(int[] labels)
labels
- the labels to keep; the data set is stripped of all but the passed in labels
public void filterAndStrip(int[] labels)
labels
- the labels to strip down to
public List<DataSet> dataSetBatches(int num)
num
- the number to split by
public List<List<DataSet>> sortAndBatchByNumLabels()
public void sortByLabel()
public void addRow(DataSet d, int i)
public org.jblas.DoubleMatrix exampleSums()
public org.jblas.DoubleMatrix exampleMaxs()
public org.jblas.DoubleMatrix exampleMeans()
public void saveTo(File file, boolean binary) throws IOException
Throws: IOException
public static DataSet load(File path) throws IOException
Throws: IOException
public DataSet sample(int numSamples)
numSamples
- the number of samples to get
public DataSet sample(int numSamples, org.apache.commons.math3.random.RandomGenerator rng)
numSamples
- the number of samples to get
rng
- the rng to use
public DataSet sample(int numSamples, boolean withReplacement)
numSamples
- the number of samples to get
withReplacement
- whether to allow duplicates (only tracked by example row number)
public DataSet sample(int numSamples, org.apache.commons.math3.random.RandomGenerator rng, boolean withReplacement)
numSamples
- the number of samples to get
rng
- the rng to use
withReplacement
- whether to allow duplicates (only tracked by example row number)
public void roundToTheNearest(int roundTo)
public int numOutcomes()
public int numExamples()
public String toString()
public static void main(String[] args) throws IOException
Throws: IOException
public void write(OutputStream os)
Specified by: write in interface Persistable
public void load(InputStream is)
Specified by: load in interface Persistable
Copyright © 2014. All Rights Reserved.