[mlpack] 148/207: use LoadCSV to implement csv/tsv/txt loader

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu Mar 23 17:53:49 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 2b5b84f681bebc6616abc232fe372e555deb6407
Author: stereomatchingkiss <stereomatchingkiss at gmail.com>
Date:   Sun Jun 5 10:57:39 2016 +0800

    use LoadCSV to implement csv/tsv/txt loader
---
 src/mlpack/core/data/load_impl.hpp | 110 ++-----------------------------------
 1 file changed, 4 insertions(+), 106 deletions(-)

diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 7434b4c..7b407da 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -15,6 +15,7 @@
 // In case it hasn't already been included.
 #include "load.hpp"
 #include "extension.hpp"
+#include "load_csv.hpp"
 
 #include <exception>
 #include <algorithm>
@@ -368,115 +369,12 @@ bool Load(const std::string& filename,
   Timer::Start("loading_data");
 
   // Get the extension.
-  std::string extension = Extension(filename);
-
-  // Catch nonexistent files by opening the stream ourselves.
-  std::fstream stream;
-  stream.open(filename.c_str(), std::fstream::in);
-
-  if (!stream.is_open())
-  {
-    Timer::Stop("loading_data");
-    if (fatal)
-      Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
-    else
-      Log::Warn << "Cannot open file '" << filename << "'; load failed."
-          << std::endl;
-
-    return false;
-  }
+  const std::string extension = Extension(filename);
 
   if (extension == "csv" || extension == "tsv" || extension == "txt")
   {
-    // True if we're looking for commas; if false, we're looking for spaces.
-    bool commas = (extension == "csv");
-
-    std::string type;
-    if (extension == "csv")
-      type = "CSV data";
-    else
-      type = "raw ASCII-formatted data";
-
-    Log::Info << "Loading '" << filename << "' as " << type << ".  "
-        << std::flush;
-    std::string separators;
-    if (commas)
-      separators = ",";
-    else
-      separators = " \t";
-
-    // We'll load this as CSV (or CSV with spaces or tabs) according to
-    // RFC4180.  So the first thing to do is determine the size of the matrix.
-    std::string buffer;
-    size_t cols = 0;
-
-    std::getline(stream, buffer, '\n');
-    // Count commas and whitespace in the line, ignoring anything inside
-    // quotes.
-    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
-    boost::escaped_list_separator<char> sep("\\", separators, "\"");
-    Tokenizer tok(buffer, sep);
-    for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i)
-      ++cols;
-
-    // Now count the number of lines in the file.  We've already counted the
-    // first one.
-    size_t rows = 1;
-    while (!stream.eof() && !stream.bad() && !stream.fail())
-    {
-      std::getline(stream, buffer, '\n');
-      if (!stream.fail())
-        ++rows;
-    }
-
-    // Now we have the size.  So resize our matrix.
-    if (transpose)
-    {
-      matrix.set_size(cols, rows);
-      info = DatasetMapper<PolicyType>(info.Policy(), cols);
-    }
-    else
-    {
-      matrix.set_size(rows, cols);
-      info = DatasetMapper<PolicyType>(info.Policy(), rows);
-    }
-
-    stream.close();
-    stream.open(filename, std::fstream::in);
-
-    if (transpose)
-    {
-      std::vector<std::vector<std::string>> tokensArray;
-      std::vector<std::string> tokens;
-      while (!stream.bad() && !stream.fail() && !stream.eof())
-      {
-        // Extract line by line.
-        std::getline(stream, buffer, '\n');
-        Tokenizer lineTok(buffer, sep);
-        tokens = details::ToTokens(lineTok);
-        if (tokens.size() == cols)
-        {
-          tokensArray.emplace_back(std::move(tokens));
-        }
-      }
-      for(size_t i = 0; i != cols; ++i)
-      {
-        details::TransposeTokens(tokensArray, tokens, i);
-        info.MapTokens(tokens, i, matrix);
-      }
-    }
-    else
-    {
-      size_t row = 0;
-      while (!stream.bad() && !stream.fail() && !stream.eof())
-      {
-        // Extract line by line.
-        std::getline(stream, buffer, '\n');
-        Tokenizer lineTok(buffer, sep);
-        info.MapTokens(details::ToTokens(lineTok), row, matrix);
-        ++row;
-      }
-    }
+    LoadCSV loader(filename);
+    loader.Load(matrix, info, transpose);
   }
   else if (extension == "arff")
   {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list