[mlpack] 148/207: use LoadCSV to implement csv/tsv/txt loader
Barak A. Pearlmutter
barak+git at pearlmutter.net
Thu Mar 23 17:53:49 UTC 2017
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch master in repository mlpack.
commit 2b5b84f681bebc6616abc232fe372e555deb6407
Author: stereomatchingkiss <stereomatchingkiss at gmail.com>
Date: Sun Jun 5 10:57:39 2016 +0800
use LoadCSV to implement csv/tsv/txt loader
---
src/mlpack/core/data/load_impl.hpp | 110 ++-----------------------------------
1 file changed, 4 insertions(+), 106 deletions(-)
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 7434b4c..7b407da 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -15,6 +15,7 @@
// In case it hasn't already been included.
#include "load.hpp"
#include "extension.hpp"
+#include "load_csv.hpp"
#include <exception>
#include <algorithm>
@@ -368,115 +369,12 @@ bool Load(const std::string& filename,
Timer::Start("loading_data");
// Get the extension.
- std::string extension = Extension(filename);
-
- // Catch nonexistent files by opening the stream ourselves.
- std::fstream stream;
- stream.open(filename.c_str(), std::fstream::in);
-
- if (!stream.is_open())
- {
- Timer::Stop("loading_data");
- if (fatal)
- Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
- else
- Log::Warn << "Cannot open file '" << filename << "'; load failed."
- << std::endl;
-
- return false;
- }
+ const std::string extension = Extension(filename);
if (extension == "csv" || extension == "tsv" || extension == "txt")
{
- // True if we're looking for commas; if false, we're looking for spaces.
- bool commas = (extension == "csv");
-
- std::string type;
- if (extension == "csv")
- type = "CSV data";
- else
- type = "raw ASCII-formatted data";
-
- Log::Info << "Loading '" << filename << "' as " << type << ". "
- << std::flush;
- std::string separators;
- if (commas)
- separators = ",";
- else
- separators = " \t";
-
- // We'll load this as CSV (or CSV with spaces or tabs) according to
- // RFC4180. So the first thing to do is determine the size of the matrix.
- std::string buffer;
- size_t cols = 0;
-
- std::getline(stream, buffer, '\n');
- // Count commas and whitespace in the line, ignoring anything inside
- // quotes.
- typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
- boost::escaped_list_separator<char> sep("\\", separators, "\"");
- Tokenizer tok(buffer, sep);
- for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i)
- ++cols;
-
- // Now count the number of lines in the file. We've already counted the
- // first one.
- size_t rows = 1;
- while (!stream.eof() && !stream.bad() && !stream.fail())
- {
- std::getline(stream, buffer, '\n');
- if (!stream.fail())
- ++rows;
- }
-
- // Now we have the size. So resize our matrix.
- if (transpose)
- {
- matrix.set_size(cols, rows);
- info = DatasetMapper<PolicyType>(info.Policy(), cols);
- }
- else
- {
- matrix.set_size(rows, cols);
- info = DatasetMapper<PolicyType>(info.Policy(), rows);
- }
-
- stream.close();
- stream.open(filename, std::fstream::in);
-
- if (transpose)
- {
- std::vector<std::vector<std::string>> tokensArray;
- std::vector<std::string> tokens;
- while (!stream.bad() && !stream.fail() && !stream.eof())
- {
- // Extract line by line.
- std::getline(stream, buffer, '\n');
- Tokenizer lineTok(buffer, sep);
- tokens = details::ToTokens(lineTok);
- if (tokens.size() == cols)
- {
- tokensArray.emplace_back(std::move(tokens));
- }
- }
- for(size_t i = 0; i != cols; ++i)
- {
- details::TransposeTokens(tokensArray, tokens, i);
- info.MapTokens(tokens, i, matrix);
- }
- }
- else
- {
- size_t row = 0;
- while (!stream.bad() && !stream.fail() && !stream.eof())
- {
- // Extract line by line.
- std::getline(stream, buffer, '\n');
- Tokenizer lineTok(buffer, sep);
- info.MapTokens(details::ToTokens(lineTok), row, matrix);
- ++row;
- }
- }
+ LoadCSV loader(filename);
+ loader.Load(matrix, info, transpose);
}
else if (extension == "arff")
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits mailing list