[mlpack] 190/207: Refactor MapPolicy classes to also allow a first pass.
Barak A. Pearlmutter
barak+git at pearlmutter.net
Thu Mar 23 17:53:53 UTC 2017
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch master
in repository mlpack.
commit 2f22721425b96d3a8c6616948c7df09fed6beead
Author: Ryan Curtin <ryan at ratml.org>
Date: Sat Mar 18 13:25:36 2017 -0400
Refactor MapPolicy classes to also allow a first pass.
This might be necessary, for example, for the IncrementPolicy class, where we
must know at the outset whether each dimension is categorical or numeric.
---
.../core/data/map_policies/increment_policy.hpp | 66 +++++++++++++++++++---
.../core/data/map_policies/missing_policy.hpp | 64 ++++++++++++++++-----
2 files changed, 108 insertions(+), 22 deletions(-)
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index 3c6c010..1676fbf 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -32,6 +32,38 @@ class IncrementPolicy
// typedef of MappedType
using MappedType = size_t;
+ //! We do need a first pass over the data to set the dimension types right.
+ static const bool NeedsFirstPass = true;
+
+ /**
+ * Determine if the dimension is numeric or categorical.
+ */
+ template<typename T>
+ void MapFirstPass(const std::string& string,
+ const size_t dim,
+ std::vector<Datatype>& types)
+ {
+ if (types[dim] == Datatype::categorical)
+ {
+ // No need to check; it's already categorical.
+ return;
+ }
+
+ // Otherwise we need to attempt to read the value. If the read fails, the
+ // dimension is categorical; otherwise we leave it at the default of
+ // numeric.
+ std::stringstream token;
+ token.str(string);
+ T val;
+ token >> val;
+
+ if (token.fail() || !token.eof())
+ {
+ // Parsing failed; the dimension is categorical.
+ types[dim] = Datatype::categorical;
+ }
+ }
+
/**
* Given the string and the dimension to which it belongs, and the maps
* and types given by the DatasetMapper class, returns its numeric mapping.
@@ -45,12 +77,32 @@ class IncrementPolicy
* @param maps Unordered map given by the DatasetMapper.
* @param types Vector containing the type information about each dimension.
*/
- template <typename MapType>
- MappedType MapString(const std::string& string,
- const size_t dimension,
- MapType& maps,
- std::vector<Datatype>& types)
+ template<typename MapType, typename T>
+ T MapString(const std::string& string,
+ const size_t dimension,
+ MapType& maps,
+ std::vector<Datatype>& types)
{
+ // If we are in a categorical dimension we already know we need to map.
+ if (types[dimension] == Datatype::numeric)
+ {
+ // Check if this string needs to be mapped or if it can be read
+ // directly as a number. This will be true if nothing else in this
+ // dimension has yet been mapped, but this can't be read as a number.
+ std::stringstream token;
+ token.str(string);
+ T val;
+ token >> val;
+
+ if (!token.fail() && token.eof())
+ {
+ // We can return what we have.
+ return val;
+ }
+ }
+
+ // The token must be mapped.
+
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
@@ -60,13 +112,13 @@ class IncrementPolicy
// This string does not exist yet.
size_t& numMappings = maps[dimension].second;
- // change type of the feature to categorical
+ // Change type of the feature to categorical.
if (numMappings == 0)
types[dimension] = Datatype::categorical;
typedef boost::bimap<std::string, MappedType>::value_type PairType;
maps[dimension].first.insert(PairType(string, numMappings));
- return numMappings++;
+ return T(numMappings++);
}
else
{
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index e8bb115..a87126c 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -52,6 +52,19 @@ class MissingPolicy
// Nothing to initialize here.
}
+ //! This doesn't need a first pass over the data to set up.
+ static const bool NeedsFirstPass = false;
+
+ /**
+ * There is nothing for us to do here, but this is required by the MapPolicy
+ * type.
+ */
+ template<typename T>
+ void MapFirstPass(const std::string& /* string */, const size_t /* dim */)
+ {
+ // Nothing to do.
+ }
+
/**
* Given the string and the dimension to which it belongs by the user, and
* the maps and types given by the DatasetMapper class, returns its numeric
@@ -66,24 +79,45 @@ class MissingPolicy
* @param maps Unordered map given by the DatasetMapper.
* @param types Vector containing the type information about each dimension.
*/
- template <typename MapType>
- MappedType MapString(const std::string& string,
- const size_t dimension,
- MapType& maps,
- std::vector<Datatype>& /* types */)
+ template<typename MapType, typename T>
+ T MapString(const std::string& string,
+ const size_t dimension,
+ MapType& maps,
+ std::vector<Datatype>& /* types */)
{
- // Everything is mapped to NaN. However we must still keep track of
- // everything that we have mapped, so we add it to the maps if needed.
- if (maps.count(dimension) == 0 ||
- maps[dimension].first.left.count(string) == 0)
+ static_assert(std::numeric_limits<T>::has_quiet_NaN == true,
+ "Cannot use MissingPolicy with types where has_quiet_NaN() is false!");
+
+ // If we can load the string then there is no need for mapping.
+ std::stringstream token;
+ token.str(string);
+ T t;
+ token >> t; // Could be sped up by only doing this if we need to.
+
+ // If extraction of the value fails, or if it is a value that is supposed to
+ // be mapped, then do mapping.
+ if (token.fail() || !token.eof() ||
+ missingSet.find(string) != std::end(missingSet))
{
- // This string does not exist yet.
- typedef boost::bimap<std::string, MappedType>::value_type PairType;
- maps[dimension].first.insert(PairType(string, NaN));
- maps[dimension].second++;
- }
+ // Everything is mapped to NaN. However we must still keep track of
+ // everything that we have mapped, so we add it to the maps if needed.
+ if (maps.count(dimension) == 0 ||
+ maps[dimension].first.left.count(string) == 0)
+ {
+ // This string does not exist yet.
+ typedef boost::bimap<std::string, MappedType>::value_type PairType;
+ maps[dimension].first.insert(PairType(string,
+ std::numeric_limits<MappedType>::quiet_NaN()));
+ maps[dimension].second++;
+ }
- return std::numeric_limits<MappedType>::quiet_NaN();
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ else
+ {
+ // We can just return the value that we read.
+ return t;
+ }
}
/**
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits mailing list