#ifndef DATASET_HPP_
#define DATASET_HPP_

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// NOTE(review): a using-directive at header scope leaks into every file that
// includes this header. It is retained only because existing includers may
// rely on it; the declarations below are fully qualified and do not need it.
using namespace std;

/**
 * Reads a full dataset; limited to those that fit in memory. A
 * pointer to this must be shared among all users. As of yet, works
 * only for classification datasets, where the last value on each row
 * is the integral class label and all other values are real-valued
 * features.
 */
class Dataset {
protected:
  std::vector<std::vector<double>> invec;      ///< input vectors
  std::vector<std::size_t> targetc;            ///< target classes 1,2,...
  std::vector<std::vector<double>> prototypes; ///< target prototypes
  std::size_t nclasses;                        ///< number of classes; discovered upon load
  std::vector<std::size_t> ninc;               ///< count of rows in each class 1,2,...; ninc[0] unused

public:
  /** A dataset cannot exist without a source file and encoding values. */
  Dataset() = delete;

  /**
   * Read from a file.
   *
   * @param fname Name of the file to read.
   * @param enc0  Encoding value used in target prototypes; presumably the
   *              value for non-class elements -- TODO confirm against the
   *              implementation.
   * @param enc1  Encoding value used in target prototypes; presumably the
   *              value for class elements -- TODO confirm against the
   *              implementation.
   *
   * Both encoding values must be passed explicitly: this declaration
   * supplies no default arguments.
   *
   * TODO: differentiate classification / approx?
   */
  Dataset(std::string fname, double enc0, double enc1);

  /**
   * Look at an indexed row of data.
   *
   * @param i Row index (data indices are base 0).
   * @return Read-only reference to the feature vector of that row.
   *
   * TODO: Likely to replace this with iterators that can be generated
   * for splits. Or, rather, objects of type DataSplit.
   */
  const std::vector<double> & row(std::size_t i) const;

  /** Return number of classes. TODO: classification vs. approx? */
  std::size_t getNClasses() const;

  /** Return number of rows / instances. */
  std::size_t getNRows() const;

  /**
   * Return target class 1..N.
   *
   * NOTE: class indices are base 1 while data indices are base 0!
   *
   * @param ind Row index (base 0).
   */
  std::size_t getTargetClass(std::size_t ind) const;

  /**
   * Return the number of rows that belong to a given class.
   *
   * @param c Class index; presumably base 1 like the target classes --
   *          TODO confirm against the implementation.
   */
  std::size_t getNRowsInClass(std::size_t c) const;

  /**
   * Return a binary vector encoding the target class of a row; the real
   * values for the encoding must be given as constructor parameters.
   *
   * NOTE(review): earlier documentation mentioned defaults of -1 for
   * non-class elements and +1 for class elements, but the constructor
   * declaration has no default arguments -- confirm the intended
   * behaviour. Also note that, unlike the other accessors, this member
   * is not declared const.
   *
   * @param row Row index.
   * @return Read-only reference to the prototype vector.
   */
  const std::vector<double> & prototype(std::size_t row);

  /**
   * Dump data to a stream.
   *
   * @param ost Output stream that receives the dump.
   */
  void toStream(std::ostream & ost) const;
};

#endif