mi024

College project "Projet IAD" master 1
git clone https://esimon.eu/repos/mi024.git
Log | Files | Refs | README

data_set.cpp (3150B)


      1 #include <cstddef>
      2 #include <fstream>
      3 #include <string>
      4 #include <vector>
      5 
      6 #include <boost/make_shared.hpp>
      7 #include <boost/shared_ptr.hpp>
      8 #include <boost/tokenizer.hpp>
      9 #include <nmlp/Matrix.h>
     10 #include <nmlp/Tensor.h>
     11 
     12 #include "data_set.hpp"
     13 
     14 Data_set::Data_set(boost::shared_ptr<Tensor> views, std::vector<std::string> const &view_kinds): views(views), view_kinds(view_kinds) {}
     15 
     16 std::size_t Data_set::size() const {
     17 	// nmlp is const-inconsistent.
     18 	Data_set * const ncthis=const_cast<Data_set*>(this);
     19 	return ncthis->views->getMatrix(0)->getNumberOfRows();
     20 }
     21 
     22 std::size_t Data_set::number_of_views() const {
     23 	return view_kinds.size();
     24 }
     25 
     26 std::string Data_set::kind(std::size_t id) const {
     27 	return view_kinds[id];
     28 }
     29 
     30 boost::shared_ptr<Matrix> Data_set::get(std::size_t element, std::size_t view) const {
     31 	// nmlp is const-inconsistent.
     32 	Data_set * const ncthis=const_cast<Data_set*>(this);
     33 	boost::shared_ptr<Matrix> mat=ncthis->views->getMatrix(view);
     34 	boost::shared_ptr<Matrix> ret=boost::make_shared<CPUMatrix>(1, mat->getNumberOfColumns());
     35 	for(std::size_t x=0; x<mat->getNumberOfColumns(); ++x)
     36 		ret->setValue(0, x, mat->getValue(element, x));
     37 	return ret;
     38 }
     39 
     40 boost::shared_ptr<Matrix> Data_set::get(std::size_t view) const {
     41 	// nmlp is const-inconsistent.
     42 	Data_set * const ncthis=const_cast<Data_set*>(this);
     43 	return ncthis->views->getMatrix(view);
     44 }
     45 
     46 Data_set classification_svmfile_to_data_set(std::string const &filepath, std::vector<std::string> kinds){
     47 	if(kinds.size()!=2)
     48 		throw std::runtime_error("Classification SVM files always represent 2 views, but the given kinds vector is not of size 2");
     49 
     50 	std::ifstream file(filepath);
     51 	if(!file)
     52 		throw std::runtime_error("Can't open SVM file");
     53 	
     54 	std::vector<float> output;
     55 	std::vector<std::vector<float> > input;
     56 
     57 	std::string buf;
     58 	boost::char_separator<char> separators(" \t:");
     59 	std::size_t max_feature=1;
     60 	while(std::getline(file, buf)){
     61 		typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
     62 		tokenizer tokens(buf, separators);
     63 		tokenizer::iterator token=tokens.begin();
     64 
     65 		output.push_back(boost::lexical_cast<float>(*token++));
     66 		input.push_back(std::vector<float>(max_feature));
     67 
     68 		while(token!=tokens.end()){
     69 			std::size_t const k=boost::lexical_cast<std::size_t>(*token++);
     70 			if(token==tokens.end())
     71 				break;
     72 			float const v=boost::lexical_cast<float>(*token++);
     73 
     74 			if(k>max_feature){
     75 				max_feature=k;
     76 				for(std::vector<std::vector<float> >::iterator it=input.begin(); it!=input.end(); ++it)
     77 					it->resize(max_feature);
     78 			}
     79 
     80 			input.back()[k-1]=v;
     81 		}
     82 	}
     83 
     84 	boost::shared_ptr<CPUMatrix> output_matrix=boost::make_shared<CPUMatrix>(output.size(), 1);
     85 	boost::shared_ptr<CPUMatrix> input_matrix=boost::make_shared<CPUMatrix>(input.size(), max_feature);
     86 	for(std::size_t row=0; row<input.size(); ++row){
     87 		output_matrix->setValue(row, 0, output[row]);
     88 		for(std::size_t column=0; column<max_feature; ++column)
     89 			input_matrix->setValue(row, column, input[row][column]);
     90 	}
     91 
     92 	boost::shared_ptr<Tensor> data=boost::make_shared<Tensor>(2);
     93 	data->setMatrix(0, input_matrix);
     94 	data->setMatrix(1, output_matrix);
     95 	return Data_set(data, kinds);
     96 }
     97