[mlpack] 141/207: add overload, able to move string
Barak A. Pearlmutter
barak+git at pearlmutter.net
Thu Mar 23 17:53:48 UTC 2017
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch master
in repository mlpack.
commit 3aa3823536c9eb453ce265b2c10fd660f6102d70
Author: stereomatchingkiss <stereomatchingkiss at gmail.com>
Date: Sat Jun 4 19:25:20 2016 +0800
add overload, able to move string
---
src/mlpack/core/data/dataset_info.hpp | 124 +++++++++++++++++++++++++++++
src/mlpack/core/data/dataset_info_impl.hpp | 101 +++++++++++++++++++++++
2 files changed, 225 insertions(+)
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
new file mode 100644
index 0000000..7406b45
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -0,0 +1,124 @@
+/**
+ * @file dataset_info.hpp
+ * @author Ryan Curtin
+ *
+ * Defines the DatasetInfo class, which holds information about a dataset. This
+ * is useful when the dataset contains categorical non-numeric features that
+ * need to be mapped to categorical numeric features.
+ */
+#ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
+#define MLPACK_CORE_DATA_DATASET_INFO_HPP
+
+#include <mlpack/core.hpp>
+#include <unordered_map>
+#include <boost/bimap.hpp>
+
+namespace mlpack {
+namespace data {
+
+/**
+ * The Datatype enum specifies the types of data mlpack algorithms can use. The
+ * vast majority of mlpack algorithms can only use numeric data (i.e.
+ * float/double/etc.), but some algorithms can use categorical data, specified
+ * via this Datatype enum and the DatasetInfo class.
+ */
+enum Datatype : bool /* two kinds of data, so a bool-sized enum suffices */
+{
+  numeric = false,
+  categorical = true
+};
+
+/**
+ * Auxiliary information for a dataset, including mappings to/from strings and
+ * the datatype of each dimension. DatasetInfo objects are optionally produced
+ * by data::Load(), and store the type of each dimension (Datatype::numeric or
+ * Datatype::categorical) as well as mappings from strings to unsigned integers
+ * and vice versa.
+ */
+class DatasetInfo
+{
+ public:
+  /**
+   * Create the DatasetInfo object with the given dimensionality; every
+   * dimension starts out as Datatype::numeric. The non-const Type() overload
+   * grows the dimensionality when asked for a dimension past the current end.
+   */
+  DatasetInfo(const size_t dimensionality = 0);
+
+  /**
+   * Given the string and the dimension to which it belongs, return its numeric
+   * mapping. If no mapping yet exists, the string is added to the list of
+   * mappings for the given dimension. The dimension parameter refers to the
+   * index of the dimension of the string (i.e. the row in the dataset).
+   * @param string String to find/create mapping for.
+   * @param dimension Index of the dimension of the string.
+   */
+  size_t MapString(const std::string &string, const size_t dimension)
+  {
+    // A plain MapString(...) call here would recurse into this very overload.
+    return MapString<const std::string&>(string, dimension);
+  }
+
+  //! Same as the overload above, but takes an rvalue string, so that a newly
+  //! created mapping can take the string by move instead of copying it.
+  size_t MapString(std::string &&string, const size_t dimension)
+  {
+    // Explicit template argument for the same reason as in the overload above.
+    return MapString<std::string>(std::move(string), dimension);
+  }
+
+  /**
+   * Return the string that corresponds to a given value in a given dimension.
+   * If the value is not a valid mapping in the given dimension, a
+   * std::invalid_argument is thrown.
+   *
+   * @param value Mapped value for string.
+   * @param dimension Dimension to unmap string from.
+   */
+  const std::string& UnmapString(const size_t value, const size_t dimension);
+
+  //! Return the type of a given dimension (numeric or categorical).
+  Datatype Type(const size_t dimension) const;
+  //! Modify the type of a given dimension (be careful!).
+  Datatype& Type(const size_t dimension);
+
+  //! Get the number of mappings for a particular dimension; this is 0 for a
+  //! dimension in which no string has been mapped (e.g. a numeric one).
+  size_t NumMappings(const size_t dimension) const;
+
+  /**
+   * Get the dimensionality of the DatasetInfo object (that is, how many
+   * dimensions it has information for). If this object was created by a call
+   * to mlpack::data::Load(), then the dimensionality will be the same as the
+   * number of rows (dimensions) in the dataset.
+   */
+  size_t Dimensionality() const;
+
+  //! Serialize the dataset information (the types and the mappings).
+  template<typename Archive>
+  void Serialize(Archive& ar, const unsigned int /* version */)
+  {
+    ar & data::CreateNVP(types, "types");
+    ar & data::CreateNVP(maps, "maps");
+  }
+
+ private:
+  //! Types of each dimension.
+  std::vector<Datatype> types;
+
+  //! Per dimension: the string <-> integer bimap and its number of mappings.
+  //! An entry is created the first time a string is mapped in a dimension.
+  std::unordered_map<size_t, std::pair<boost::bimap<std::string, size_t>,
+      size_t>> maps;
+
+  //! Shared implementation behind both public MapString() overloads.
+  template<typename T>
+  size_t MapString(T&& string, const size_t dimension);
+};
+
+} // namespace data
+} // namespace mlpack
+
+#include "dataset_info_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
new file mode 100644
index 0000000..ed20f06
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -0,0 +1,101 @@
+/**
+ * @file dataset_info_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the DatasetInfo class.
+ */
+#ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+#define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+
+// In case it hasn't already been included.
+#include "dataset_info.hpp"
+
+namespace mlpack {
+namespace data {
+
+// Constructor: every one of the `dimensionality` dimensions starts as numeric.
+inline DatasetInfo::DatasetInfo(const size_t dimensionality) :
+ types(dimensionality, Datatype::numeric)
+{
+ // Nothing else to initialize; maps is default-constructed.
+}
+
+// Map the string to a numeric id, creating a new mapping when none exists.
+template<typename T>
+inline size_t DatasetInfo::MapString(T&& string,
+                                     const size_t dimension)
+{
+  // operator[] creates an empty entry (no strings, count 0) the first time a
+  // dimension is seen; every later access goes through this one reference.
+  auto& entry = maps[dimension];
+
+  // This string already exists in the mapping.
+  auto known = entry.first.left.find(string);
+  if (known != entry.first.left.end())
+    return known->second;
+
+  // This string does not exist yet. The first mapping in a dimension makes it
+  // categorical; the non-const Type() grows `types` when the dimension lies
+  // past its end, where indexing types[dimension] would be out of bounds.
+  size_t& numMappings = entry.second;
+  if (numMappings == 0)
+    Type(dimension) = Datatype::categorical;
+
+  typedef boost::bimap<std::string, size_t>::value_type PairType;
+  entry.first.insert(PairType(std::forward<T>(string), numMappings));
+  return numMappings++;
+}
+
+// Return the string corresponding to a value in a given dimension.
+inline const std::string& DatasetInfo::UnmapString(const size_t value,
+                                                   const size_t dimension)
+{
+  // Use find() rather than maps[dimension]: operator[] would insert an empty
+  // entry for a dimension that has no mappings, just before we throw.
+  const auto dim = maps.find(dimension);
+  if (dim == maps.end() || dim->second.first.right.count(value) == 0)
+  {
+    std::ostringstream oss;
+    oss << "DatasetInfo::UnmapString(): value '" << value << "' unknown for "
+        << "dimension " << dimension;
+    throw std::invalid_argument(oss.str());
+  }
+  return dim->second.first.right.at(value);
+}
+
+// Get the type of a particular dimension.
+inline Datatype DatasetInfo::Type(const size_t dimension) const
+{
+  // In range: simply hand back a copy of the stored type.
+  if (dimension < types.size())
+    return types[dimension];
+
+  // This const overload cannot grow the object, so report the bad index.
+  std::ostringstream message;
+  message << "requested type of dimension " << dimension
+          << ", but dataset only " << "has " << types.size() << " dimensions";
+  throw std::invalid_argument(message.str());
+}
+
+inline Datatype& DatasetInfo::Type(const size_t dimension)
+{
+  // Grow on demand; dimensions added along the way start out as numeric.
+  if (types.size() <= dimension)
+    types.resize(dimension + 1, Datatype::numeric);
+  return types[dimension];
+}
+
+inline size_t DatasetInfo::NumMappings(const size_t dimension) const
+{
+  return (maps.count(dimension) != 0) ? maps.at(dimension).second : 0;
+}
+
+inline size_t DatasetInfo::Dimensionality() const
+{
+ return types.size(); // One Datatype is stored per dimension.
+}
+
+} // namespace data
+} // namespace mlpack
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list