[mlpack] 141/207: add overload, able to move string

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu Mar 23 17:53:48 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 3aa3823536c9eb453ce265b2c10fd660f6102d70
Author: stereomatchingkiss <stereomatchingkiss at gmail.com>
Date:   Sat Jun 4 19:25:20 2016 +0800

    add overload, able to move string
---
 src/mlpack/core/data/dataset_info.hpp      | 124 +++++++++++++++++++++++++++++
 src/mlpack/core/data/dataset_info_impl.hpp | 101 +++++++++++++++++++++++
 2 files changed, 225 insertions(+)

diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
new file mode 100644
index 0000000..7406b45
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -0,0 +1,124 @@
+/**
+ * @file dataset_info.hpp
+ * @author Ryan Curtin
+ *
+ * Defines the DatasetInfo class, which holds information about a dataset.  This
+ * is useful when the dataset contains categorical non-numeric features that
+ * need to be mapped to categorical numeric features.
+ */
+#ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
+#define MLPACK_CORE_DATA_DATASET_INFO_HPP
+
+#include <mlpack/core.hpp>
+#include <unordered_map>
+#include <boost/bimap.hpp>
+
+namespace mlpack {
+namespace data {
+
+/**
+ * The Datatype enum specifies the types of data mlpack algorithms can use.  The
+ * vast majority of mlpack algorithms can only use numeric data (i.e.
+ * float/double/etc.), but some algorithms can use categorical data, specified
+ * via this Datatype enum and the DatasetInfo class.
+ */
+enum Datatype : bool /* bool is all the precision we need for two types */
+{
+  numeric = 0,     //!< The dimension holds ordinary numeric values.
+  categorical = 1  //!< The dimension holds categories (mapped strings).
+};
+
+/**
+ * Auxiliary information for a dataset, including mappings to/from strings and
+ * the datatype of each dimension.  DatasetInfo objects are optionally produced
+ * by data::Load(), and store the type of each dimension (Datatype::numeric or
+ * Datatype::categorical) as well as mappings from strings to unsigned integers
+ * and vice versa.
+ */
+class DatasetInfo
+{
+ public:
+  /**
+   * Create the DatasetInfo object with the given dimensionality.  Note that the
+   * dimensionality cannot be changed later; you will have to create a new
+   * DatasetInfo object.
+   */
+  DatasetInfo(const size_t dimensionality = 0);
+
+  /**
+   * Given the string and the dimension to which it belongs, return its numeric
+   * mapping.  If no mapping yet exists, the string is added to the list of
+   * mappings for the given dimension.  The dimension parameter refers to the
+   * index of the dimension of the string (i.e. the row in the dataset).
+   *
+   * @param string String to find/create mapping for.
+   * @param dimension Index of the dimension of the string.
+   */
+  size_t MapString(const std::string& string, const size_t dimension)
+  {
+    // The explicit template argument selects the private template; a plain
+    // MapString(string, dimension) would resolve to this overload and recurse.
+    return MapString<const std::string&>(string, dimension);
+  }
+
+  //! Same as above, but takes an rvalue string so that it can be moved from.
+  size_t MapString(std::string&& string, const size_t dimension)
+  {
+    return MapString<std::string>(std::move(string), dimension);
+  }
+
+  /**
+   * Return the string that corresponds to a given value in a given dimension.
+   * If the string is not a valid mapping in the given dimension, a
+   * std::invalid_argument is thrown.
+   *
+   * @param value Mapped value for string.
+   * @param dimension Dimension to unmap string from.
+   */
+  const std::string& UnmapString(const size_t value, const size_t dimension);
+
+  //! Return the type of a given dimension (numeric or categorical).
+  Datatype Type(const size_t dimension) const;
+  //! Modify the type of a given dimension (be careful!).
+  Datatype& Type(const size_t dimension);
+
+  //! Get the number of mappings for a particular dimension.  If the dimension
+  //! is numeric, then this will return 0.
+  size_t NumMappings(const size_t dimension) const;
+
+  /**
+   * Get the dimensionality of the DatasetInfo object (that is, how many
+   * dimensions it has information for).  If this object was created by a call
+   * to mlpack::data::Load(), then the dimensionality will be the same as the
+   * number of rows (dimensions) in the dataset.
+   */
+  size_t Dimensionality() const;
+
+  //! Serialize the dataset information (the types and the maps).
+  template<typename Archive>
+  void Serialize(Archive& ar, const unsigned int /* version */)
+  {
+    ar & data::CreateNVP(types, "types");
+    ar & data::CreateNVP(maps, "maps");
+  }
+
+ private:
+  //! Types of each dimension.
+  std::vector<Datatype> types;
+
+  //! Mappings from strings to integers.  Map entries will only exist for
+  //! dimensions that are categorical.
+  std::unordered_map<size_t, std::pair<boost::bimap<std::string, size_t>,
+      size_t>> maps;
+
+  //! Shared implementation of both public MapString() overloads.
+  template<typename T>
+  size_t MapString(T&& string, const size_t dimension);
+};
+
+} // namespace data
+} // namespace mlpack
+
+#include "dataset_info_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
new file mode 100644
index 0000000..ed20f06
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -0,0 +1,101 @@
+/**
+ * @file dataset_info_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the DatasetInfo class.
+ */
+#ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+#define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+
+// In case it hasn't already been included.
+#include "dataset_info.hpp"
+
+namespace mlpack {
+namespace data {
+
+// Constructor: every one of the requested dimensions starts out numeric.
+inline DatasetInfo::DatasetInfo(const size_t dimensionality)
+    : types(dimensionality, Datatype::numeric)
+{
+  // The member initializer above does all of the work.
+}
+
+// Map the string to a numeric id, creating the mapping if it is missing.
+template<typename T>
+inline size_t DatasetInfo::MapString(T&& string,
+                                     const size_t dimension)
+{
+  typedef boost::bimap<std::string, size_t>::value_type PairType;
+
+  // Fetch (or default-create) the bookkeeping for this dimension once: the
+  // bimap of string <-> id and the count of ids handed out so far.
+  auto& entry = maps[dimension];
+  auto& mapping = entry.first;
+  size_t& numMappings = entry.second;
+
+  // Known string: return the id it already has.
+  if (mapping.left.count(string) != 0)
+    return mapping.left.at(std::forward<T>(string));
+
+  // New string.  The first mapping in a dimension marks it categorical.
+  // NOTE(review): types[dimension] is not bounds-checked here.
+  if (numMappings == 0)
+    types[dimension] = Datatype::categorical;
+
+  mapping.insert(PairType(std::forward<T>(string), numMappings));
+  return numMappings++;
+}
+
+// Return the string corresponding to a value in a given dimension.
+inline const std::string& DatasetInfo::UnmapString(const size_t value,
+                                                   const size_t dimension)
+{
+  // Use find() rather than operator[] so that querying a dimension with no
+  // mappings does not insert an empty entry into 'maps' as a side effect.
+  const auto dimIt = maps.find(dimension);
+  if (dimIt == maps.end() || dimIt->second.first.right.count(value) == 0)
+  {
+    std::ostringstream oss;
+    oss << "DatasetInfo::UnmapString(): value '" << value << "' unknown for "
+        << "dimension " << dimension;
+    throw std::invalid_argument(oss.str());
+  }
+  return dimIt->second.first.right.at(value);
+}
+
+// Read-only access to the type of one dimension; out-of-range requests throw.
+inline Datatype DatasetInfo::Type(const size_t dimension) const
+{
+  const size_t numDimensions = types.size();
+  if (dimension < numDimensions)
+    return types[dimension];
+
+  // Out of range: report the request together with the known size.
+  std::ostringstream msg;
+  msg << "requested type of dimension " << dimension << ", but dataset only "
+      << "has " << numDimensions << " dimensions";
+  throw std::invalid_argument(msg.str());
+}
+
+inline Datatype& DatasetInfo::Type(const size_t dimension)
+{
+  // Grow on demand; dimensions created this way start out numeric.
+  if (types.size() <= dimension)
+    types.resize(dimension + 1, Datatype::numeric);
+  return types[dimension];
+}
+
+inline size_t DatasetInfo::NumMappings(const size_t dimension) const
+{
+  return (maps.count(dimension) != 0) ? maps.at(dimension).second : 0;
+}
+
+inline size_t DatasetInfo::Dimensionality() const
+{
+  return types.size();  // 'types' holds one Datatype per dimension.
+}
+
+} // namespace data
+} // namespace mlpack
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list