[mlpack] 190/207: Refactor MapPolicy classes to also allow a first pass.

Thu Mar 23 17:53:53 UTC 2017

This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 2f22721425b96d3a8c6616948c7df09fed6beead
Author: Ryan Curtin <ryan at ratml.org>
Date:   Sat Mar 18 13:25:36 2017 -0400

    Refactor MapPolicy classes to also allow a first pass.
    
    This might be necessary, for example, for the IncrementPolicy class, where we
    must know at the outset whether each dimension is categorical or numeric.
---
 .../core/data/map_policies/increment_policy.hpp    | 66 +++++++++++++++++++---
 .../core/data/map_policies/missing_policy.hpp      | 64 ++++++++++++++++-----
 2 files changed, 108 insertions(+), 22 deletions(-)

diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index 3c6c010..1676fbf 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -32,6 +32,38 @@ class IncrementPolicy
   // typedef of MappedType
   using MappedType = size_t;
 
+  //! We do need a first pass over the data to set the dimension types right.
+  static const bool NeedsFirstPass = true;
+
+  /**
+   * Determine if the dimension is numeric or categorical.
+   */
+  template<typename T>
+  void MapFirstPass(const std::string& string,
+                    const size_t dim,
+                    std::vector<Datatype>& types)
+  {
+    if (types[dim] == Datatype::categorical)
+    {
+      // No need to check; it's already categorical.
+      return;
+    }
+
+    // Otherwise we need to attempt to read the value.  If the read fails, the
+    // dimension is categorical; otherwise we leave it at the default of
+    // numeric.
+    std::stringstream token;
+    token.str(string);
+    T val;
+    token >> val;
+
+    if (token.fail() || !token.eof())
+    {
+      // Parsing failed; the dimension is categorical.
+      types[dim] = Datatype::categorical;
+    }
+  }
+
   /**
    * Given the string and the dimension to which the it belongs, and the maps
    * and types given by the DatasetMapper class, returns its numeric mapping.
@@ -45,12 +77,32 @@ class IncrementPolicy
    * @param maps Unordered map given by the DatasetMapper.
    * @param types Vector containing the type information about each dimensions.
    */
-  template <typename MapType>
-  MappedType MapString(const std::string& string,
-                       const size_t dimension,
-                       MapType& maps,
-                       std::vector<Datatype>& types)
+  template<typename MapType, typename T>
+  T MapString(const std::string& string,
+              const size_t dimension,
+              MapType& maps,
+              std::vector<Datatype>& types)
   {
+    // If we are in a categorical dimension we already know we need to map.
+    if (types[dimension] == Datatype::numeric)
+    {
+      // Check if this string needs to be mapped or if it can be read
+      // directly as a number.  This will be true if nothing else in this
+      // dimension has yet been mapped, but this can't be read as a number.
+      std::stringstream token;
+      token.str(string);
+      T val;
+      token >> val;
+
+      if (!token.fail() && token.eof())
+      {
+        // We can return what we have.
+        return val;
+      }
+    }
+
+    // The token must be mapped.
+
     // If this condition is true, either we have no mapping for the given string
     // or we have no mappings for the given dimension at all.  In either case,
     // we create a mapping.
@@ -60,13 +112,13 @@ class IncrementPolicy
       // This string does not exist yet.
       size_t& numMappings = maps[dimension].second;
 
-      // change type of the feature to categorical
+      // Change type of the feature to categorical.
       if (numMappings == 0)
         types[dimension] = Datatype::categorical;
 
       typedef boost::bimap<std::string, MappedType>::value_type PairType;
       maps[dimension].first.insert(PairType(string, numMappings));
-      return numMappings++;
+      return T(numMappings++);
     }
     else
     {
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index e8bb115..a87126c 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -52,6 +52,19 @@ class MissingPolicy
     // Nothing to initialize here.
   }
 
+  //! This doesn't need a first pass over the data to set up.
+  static const bool NeedsFirstPass = false;
+
+  /**
+   * There is nothing for us to do here, but this is required by the MapPolicy
+   * type.
+   */
+  template<typename T>
+  void MapFirstPass(const std::string& /* string */, const size_t /* dim */)
+  {
+    // Nothing to do.
+  }
+
   /**
    * Given the string and the dimension to which it belongs by the user, and
    * the maps and types given by the DatasetMapper class, returns its numeric
@@ -66,24 +79,45 @@ class MissingPolicy
    * @param maps Unordered map given by the DatasetMapper.
    * @param types Vector containing the type information about each dimensions.
    */
-  template <typename MapType>
-  MappedType MapString(const std::string& string,
-                       const size_t dimension,
-                       MapType& maps,
-                       std::vector<Datatype>& /* types */)
+  template<typename MapType, typename T>
+  T MapString(const std::string& string,
+              const size_t dimension,
+              MapType& maps,
+              std::vector<Datatype>& /* types */)
   {
-    // Everything is mapped to NaN.  However we must still keep track of
-    // everything that we have mapped, so we add it to the maps if needed.
-    if (maps.count(dimension) == 0 ||
-        maps[dimension].first.left.count(string) == 0)
+    static_assert(std::numeric_limits<T>::has_quiet_NaN == true,
+        "Cannot use MissingPolicy with types where has_quiet_NaN() is false!");
+
+    // If we can load the string then there is no need for mapping.
+    std::stringstream token;
+    token.str(string);
+    T t;
+    token >> t; // Could be sped up by only doing this if we need to.
+
+    // If extraction of the value fails, or if it is a value that is supposed to
+    // be mapped, then do mapping.
+    if (token.fail() || !token.eof() ||
+        missingSet.find(string) != std::end(missingSet))
     {
-      // This string does not exist yet.
-      typedef boost::bimap<std::string, MappedType>::value_type PairType;
-      maps[dimension].first.insert(PairType(string, NaN));
-      maps[dimension].second++;
-    }
+      // Everything is mapped to NaN.  However we must still keep track of
+      // everything that we have mapped, so we add it to the maps if needed.
+      if (maps.count(dimension) == 0 ||
+          maps[dimension].first.left.count(string) == 0)
+      {
+        // This string does not exist yet.
+        typedef boost::bimap<std::string, MappedType>::value_type PairType;
+        maps[dimension].first.insert(PairType(string,
+            std::numeric_limits<MappedType>::quiet_NaN()));
+        maps[dimension].second++;
+      }
 
-    return std::numeric_limits<MappedType>::quiet_NaN();
+      return std::numeric_limits<T>::quiet_NaN();
+    }
+    else
+    {
+      // We can just return the value that we read.
+      return t;
+    }
   }
 
   /**

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git