[mlpack] 143/207: first commit

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu Mar 23 17:53:48 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit c1e1649810a2768eb0ac05a5b6a1f3a30e211113
Author: stereomatchingkiss <stereomatchingkiss at gmail.com>
Date:   Sat Jun 4 21:48:55 2016 +0800

    first commit
---
 src/mlpack/core/data/load_csv.hpp | 281 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 281 insertions(+)

diff --git a/src/mlpack/core/data/load_csv.hpp b/src/mlpack/core/data/load_csv.hpp
new file mode 100644
index 0000000..aab567f
--- /dev/null
+++ b/src/mlpack/core/data/load_csv.hpp
@@ -0,0 +1,281 @@
+/**
+ * @file load_csv.hpp
+ * @author ThamNgapWei
+ *
+ * A CSV parser, implemented with boost::spirit, used to load CSV files.
+ */
+#ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP
+#define MLPACK_CORE_DATA_LOAD_CSV_HPP
+
+#include<boost/spirit/include/qi.hpp>
+
+#include <mlpack/core/util/log.hpp>
+#include <mlpack/core/arma_extend/arma_extend.hpp> // Includes Armadillo.
+#include <string>
+
+#include "format.hpp"
+#include "dataset_info.hpp"
+
+namespace mlpack {
+namespace data /** Functions to load and save matrices and models. */ {
+
+namespace details /** Implementation details, do not rely on them in production code */{
+
+//implementation details that do not depend on template parameters belong
+//here, outside of any class template; this can reduce duplicated binary
+//code when the compiler/linker is not smart enough to merge it
+
+}
+
+/**
+ * Loads a CSV file into an arma::Mat, mapping the fields that are not numbers
+ * through a DatasetInfo. The parser is implemented with boost::spirit; see
+ * http://theboostcpplibraries.com/boost.spirit for a quick introduction.
+ */
+class LoadCSV
+{      
+public:
+  explicit LoadCSV(std::string const &file) : inFile(file)
+  { //opens file for reading; throws std::runtime_error when that fails
+    if(!inFile.is_open())
+    {
+      throw std::runtime_error("LoadCSV can not open file");
+    }
+    inFile.unsetf(std::ios::skipws); //do not let stream extraction skip whitespace
+  }
+  //fills inout from the file and rebuilds infoSet; T is the element type of the matrix
+  template<typename T>
+  void Load(arma::Mat<T> &inout, DatasetInfo &infoSet, bool transpose = true)
+  {
+    //transpose == true stores each line of the file as a column of inout, false as a
+    //row; see the comments in ColSize for a short introduction to boost::spirit
+    if(transpose)
+    {
+      TranposeParse(inout, infoSet);
+    }
+    else
+    {
+      NonTranposeParse(inout, infoSet);
+    }
+  }
+  //returns the number of comma separated fields found on the first line of the file
+  size_t ColSize()
+  {
+    //boost tokenizer or strtok could do the same job; spirit is used
+    //here because it makes a nice introductory example
+    using namespace boost::spirit;
+    using bsi_type = boost::spirit::istream_iterator;
+    using iter_type = boost::iterator_range<bsi_type>; //hides the class-level iter_type
+
+    inFile.clear(); //reset the stream state flags so that the seek below can succeed
+    inFile.seekg(0, std::ios::beg);
+    //spirit::qi requires iterators to be at least forward iterators,
+    //but std::istream_iterator is an input iterator, so we use
+    //boost::spirit::istream_iterator to overcome this problem
+    bsi_type begin(inFile);
+    bsi_type end;
+    size_t col = 0;
+
+    //a boost spirit parser can carry "actions" (functors); whenever
+    //the parser matches its target, the attached functor is executed
+    auto findColSize = [&col](iter_type){ ++col; };
+
+    //qi::char_ consumes one character
+    //qi::char_(",\r\n") consumes only a "," or "\r" or "\n" character
+    //* means the parser (ex : qi::char_) may consume [0, any number] of characters
+    //~ means negate, so ~qi::char_(",\r\n") consumes anything except ",\r\n"
+    //parser % "," parses a list such as "1,2,3,apple" (note: no trailing comma)
+
+    //qi::raw restricts the automatic attribute conversion of boost::spirit; without it
+    //the parser would try to convert the match to std::string, which allocates memory.
+    //Once the parser is wrapped with qi::raw, the attribute (the data handed to the
+    //functor) becomes a boost::iterator_range, which saves a lot of memory allocations
+    qi::parse(begin, end, qi::raw[*~qi::char_(",\r\n")][findColSize] % ",");
+
+    return col;
+  }
+  //returns the number of lines in the file, counted with std::getline from the start
+  size_t RowSize()
+  {
+    inFile.clear(); //reset the stream state flags so that the seek below can succeed
+    inFile.seekg(0, std::ios::beg);
+    size_t row = 0;
+    std::string line;
+    while(std::getline(inFile, line))
+    {
+      ++row;
+    }
+
+    return row;
+  }
+
+private:
+  using iter_type = boost::iterator_range<std::string::iterator>; //range inside one line
+  //picks the spirit numeric parser that fits T, selected through std::enable_if
+  struct ElemParser
+  {
+    //returns an int_parser when T is an integral type
+    template<typename T>
+    static typename std::enable_if<std::is_integral<T>::value,
+    boost::spirit::qi::int_parser<T>>::type
+    Parser()
+    {
+      return boost::spirit::qi::int_parser<T>();
+    }
+
+    //returns a real_parser when T is a floating point type
+    template<typename T>
+    static typename std::enable_if<std::is_floating_point<T>::value,
+    boost::spirit::qi::real_parser<T>>::type
+    Parser()
+    {
+      return boost::spirit::qi::real_parser<T>();
+    }
+  };
+  //parses the file so that every line of the file becomes one row of inout
+  template<typename T>
+  void NonTranposeParse(arma::Mat<T> &inout, DatasetInfo &infoSet)
+  {
+    using namespace boost::spirit;
+
+    size_t row = 0;
+    size_t col = 0;
+    infoSet = DatasetInfo(RowSize()); //constructed with the number of lines
+    std::string line;
+    inout.set_size(infoSet.Dimensionality(), ColSize());
+    inFile.clear();
+    inFile.seekg(0, std::ios::beg);
+    //semantic actions: store one field at (row, col), then move on to the next column
+    auto setNum = [&](T val)
+    {
+      inout(row, col++) = val;
+    };
+    auto setCharClass = [&](iter_type const &iter)
+    {
+      inout(row, col++) =
+          static_cast<T>(infoSet.MapString(std::string(iter.begin(), iter.end()),
+                                           row));
+    };
+    //NOTE(review): qi::parse may succeed on a prefix; begin is never compared to line.end()
+    qi::rule<std::string::iterator, T()> numRule = CreateNumRule<T>();
+    qi::rule<std::string::iterator, iter_type()> charRule = CreateCharRule();
+    while(std::getline(inFile, line))
+    {
+      auto begin = line.begin();
+      const bool allNumber =
+          qi::parse(begin, line.end(), numRule[setNum] % ",");
+      if(!allNumber) //numeric parse failed: parse the same line again as categories
+      {
+        begin = line.begin();
+        col = 0; //restart at the first column of this row
+        const bool canParse = qi::parse(begin, line.end(),
+                                        charRule[setCharClass] % ",");
+        if(!canParse)
+        {
+          throw std::runtime_error("LoadCSV cannot parse categories");
+          break; //NOTE(review): unreachable, the throw above already leaves the loop
+        }
+      }
+      ++row; col = 0; //next line of the file goes to the next row
+    }
+  }
+  //parses the file so that every line of the file becomes one column of inout
+  template<typename T>
+  void TranposeParse(arma::Mat<T> &inout, DatasetInfo &infoSet)
+  {
+    infoSet = DatasetInfo(ColSize()); //constructed with the field count of the first line
+    inout.set_size(infoSet.Dimensionality(), RowSize());
+    while(!TranposeParseImpl(inout, infoSet)) //repeat until a pass reports completion
+    {
+      //nothing to do here, every pass is carried out by TranposeParseImpl
+    }
+  }
+  //runs one pass over the file; returns true when done, false to request another pass
+  template<typename T>
+  bool TranposeParseImpl(arma::Mat<T> &inout, DatasetInfo &infoSet)
+  {
+    using namespace boost::spirit;
+
+    size_t row = 0;
+    size_t col = 0;
+    std::string line;
+    inFile.clear();
+    inFile.seekg(0, std::ios::beg);
+    //semantic actions: store one field at (row, col), then move on to the next row
+    auto setNum = [&](T val)
+    {
+      inout(row++, col) = val;
+    };
+    auto setCharClass = [&](iter_type const &iter)
+    {
+      inout(row++, col) =
+          static_cast<T>(infoSet.MapString(std::string(iter.begin(), iter.end()),
+                                           col));
+    };
+    //NOTE(review): infoSet was built with ColSize() but is indexed by col (line index) - confirm
+    qi::rule<std::string::iterator, T()> numRule = CreateNumRule<T>();
+    qi::rule<std::string::iterator, iter_type()> charRule = CreateCharRule();
+    while(std::getline(inFile, line))
+    {
+      auto begin = line.begin();
+      const bool allNumber =
+          qi::parse(begin, line.end(), numRule[setNum] % ",");
+      if(!allNumber) //here row holds how many numbers setNum stored from this line
+      {
+        begin = line.begin();
+        const size_t dimension = infoSet.NumMappings(col);
+        if((dimension == 0 && row == 0) || dimension != 0)
+        {
+          row = 0; //restart at the first row of this column
+          const bool canParse = qi::parse(begin, line.end(),
+                                          charRule[setCharClass] % ",");
+          if(!canParse)
+          {
+            throw std::runtime_error("LoadCSV cannot parse categories");
+          }
+        }
+        else
+        {
+          return false; //makes TranposeParse run another pass over the file
+        }
+      }
+      row = 0; ++col; //next line of the file goes to the next column
+    }
+
+    return true;
+  }
+  //builds the rule that parses one numeric field of type T, surrounding spaces allowed
+  template<typename T>
+  boost::spirit::qi::rule<std::string::iterator, T()> CreateNumRule() const
+  {
+    using namespace boost::spirit;
+
+    //elemParser is an integer or a real parser, depending on T
+    auto elemParser = ElemParser::Parser<T>();
+    //qi::skip specifies which characters should be skipped;
+    //in this example elemParser parses an int or double value and
+    //we do not want spaces to interfere with it, so qi::skip drops them
+
+    //qi::omit drops the attribute of a parser; every spirit parser has an
+    //attribute (the type that is passed into the actions (functors)),
+    //and if it is not omitted, combining it with another attribute may
+    //change the resulting attribute type
+
+    //"-" means one or zero (same as "-" of EBNF)
+    return qi::skip(qi::char_(" "))[elemParser] >> -qi::omit[*qi::char_(" ")];
+  }
+  //builds the rule for one category field: raw text up to ' ', ',', CR or LF, spaces trimmed
+  boost::spirit::qi::rule<std::string::iterator, iter_type()> CreateCharRule() const
+  {
+    using namespace boost::spirit;
+    return -qi::omit[*qi::char_(" ")] >> qi::raw[*~qi::char_(" ,\r\n")]
+        >> -qi::omit[*qi::char_(" ")];
+  }
+  //stream of the CSV file; every pass that reads it clears its state and seeks to the start
+  std::ifstream inFile;
+};
+
+} // namespace data
+} // namespace mlpack
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list