[mlpack] 58/207: Add documentation.

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu Mar 23 17:53:40 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 6069db91cfc190757cdb9f456bf7584bba2634da
Author: Ryan Curtin <ryan at ratml.org>
Date:   Thu Jan 26 10:44:46 2017 -0500

    Add documentation.
---
 .../decision_tree/all_categorical_split.hpp        | 28 +++++++++++
 .../decision_tree/best_binary_numeric_split.hpp    | 26 ++++++++++
 src/mlpack/methods/decision_tree/decision_tree.hpp | 56 ++++++++++++++++++++--
 src/mlpack/methods/decision_tree/gini_gain.hpp     |  5 +-
 .../methods/decision_tree/information_gain.hpp     | 17 ++++---
 5 files changed, 120 insertions(+), 12 deletions(-)

diff --git a/src/mlpack/methods/decision_tree/all_categorical_split.hpp b/src/mlpack/methods/decision_tree/all_categorical_split.hpp
index 4999627..23af2c8 100644
--- a/src/mlpack/methods/decision_tree/all_categorical_split.hpp
+++ b/src/mlpack/methods/decision_tree/all_categorical_split.hpp
@@ -13,6 +13,12 @@
 namespace mlpack {
 namespace tree {
 
+/**
+ * The AllCategoricalSplit is a splitting function that will split categorical
+ * features into many children: one child for each category.
+ *
+ * @tparam FitnessFunction Fitness function to evaluate gain with.
+ */
 template<typename FitnessFunction>
 class AllCategoricalSplit
 {
@@ -27,6 +33,19 @@ class AllCategoricalSplit
    * return the value 'bestGain'.  If a split is made, then classProbabilities
    * and aux may be modified.  For this particular split type, aux will be empty
    * and classProbabilities will hold one element---the number of children.
+   *
+   * @param bestGain Best gain seen so far (we'll only split if we find gain
+   *      better than this).
+   * @param data Values of the dimension to check for a split in.
+   * @param numCategories Number of categories in the categorical data.
+   * @param labels Labels for each point.
+   * @param numClasses Number of classes in the dataset.
+   * @param minimumLeafSize Minimum number of points in a leaf node for
+   *      splitting.
+   * @param classProbabilities Class probabilities vector, which may be filled
+   *      with split information on a successful split.
+   * @param aux Auxiliary split information, which may be modified on a
+   *      successful split.
    */
   template<typename VecType>
   static double SplitIfBetter(
@@ -41,11 +60,20 @@ class AllCategoricalSplit
 
   /**
    * Return the number of children in the split.
+   *
+   * @param classProbabilities Auxiliary information for the split.
+   * @param aux (Unused) auxiliary information for the split.
    */
   template<typename ElemType>
   static size_t NumChildren(const arma::Col<ElemType>& classProbabilities,
                             const AuxiliarySplitInfo<ElemType>& /* aux */);
 
+  /**
+   * Calculate the direction a point should percolate to.
+   *
+   * @param point Point to calculate direction of.
+   * @param classProbabilities Auxiliary information for the split.
+   * @param aux (Unused) auxiliary information for the split.
+   */
   template<typename ElemType>
   static size_t CalculateDirection(
       const ElemType& point,
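
For orientation, the behavior documented above can be sketched in a few
lines of standalone C++: every point falls into the child whose index
equals its category value, and the gain of the proposed split is the
size-weighted fitness of the children.  This is an illustrative sketch
with hypothetical helper names (NegativeGini, SplitGain), not mlpack's
actual implementation:

    #include <cstddef>
    #include <vector>

    // Negative Gini impurity of a label set: -(1 - sum_i p_i^2), so a pure
    // node scores highest (0).
    double NegativeGini(const std::vector<std::size_t>& labels,
                        const std::size_t numClasses)
    {
      if (labels.empty())
        return 0.0;
      std::vector<double> counts(numClasses, 0.0);
      for (const std::size_t l : labels)
        counts[l] += 1.0;
      double sumSquares = 0.0;
      for (const double c : counts)
        sumSquares += (c / labels.size()) * (c / labels.size());
      return -(1.0 - sumSquares);
    }

    // Gain of an all-categorical split: point i goes to child categories[i];
    // each child's fitness is weighted by the fraction of points it holds.
    double SplitGain(const std::vector<std::size_t>& categories,
                     const std::vector<std::size_t>& labels,
                     const std::size_t numCategories,
                     const std::size_t numClasses)
    {
      double gain = 0.0;
      for (std::size_t c = 0; c < numCategories; ++c)
      {
        std::vector<std::size_t> childLabels;
        for (std::size_t i = 0; i < categories.size(); ++i)
          if (categories[i] == c)
            childLabels.push_back(labels[i]);
        gain += (double(childLabels.size()) / labels.size()) *
            NegativeGini(childLabels, numClasses);
      }
      return gain;
    }

Under this reading, CalculateDirection for a categorical feature is just
the point's category value, and NumChildren is the stored category count.
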
diff --git a/src/mlpack/methods/decision_tree/best_binary_numeric_split.hpp b/src/mlpack/methods/decision_tree/best_binary_numeric_split.hpp
index 3d71229..254bdae 100644
--- a/src/mlpack/methods/decision_tree/best_binary_numeric_split.hpp
+++ b/src/mlpack/methods/decision_tree/best_binary_numeric_split.hpp
@@ -12,6 +12,12 @@
 namespace mlpack {
 namespace tree {
 
+/**
+ * The BestBinaryNumericSplit is a splitting function for decision trees that
+ * will exhaustively search a numeric dimension for the best binary split.
+ *
+ * @tparam FitnessFunction Fitness function to use to calculate gain.
+ */
 template<typename FitnessFunction>
 class BestBinaryNumericSplit
 {
@@ -25,6 +31,19 @@ class BestBinaryNumericSplit
    * improves on 'bestGain', then we return the improved gain.  Otherwise we
    * return the value 'bestGain'.  If a split is made, then classProbabilities
    * and aux may be modified.
+   *
+   * @param bestGain Best gain seen so far (we'll only split if we find gain
+   *      better than this).
+   * @param data Values of the dimension to check for a split in.
+   * @param numCategories Number of categories in the categorical data.
+   * @param labels Labels for each point.
+   * @param numClasses Number of classes in the dataset.
+   * @param minimumLeafSize Minimum number of points in a leaf node for
+   *      splitting.
+   * @param classProbabilities Class probabilities vector, which may be filled
+   *      with split information on a successful split.
+   * @param aux Auxiliary split information, which may be modified on a
+   *      successful split.
    */
   template<typename VecType>
   static double SplitIfBetter(
@@ -46,6 +65,13 @@ class BestBinaryNumericSplit
     return 2;
   }
 
+  /**
+   * Given a point, calculate which child it should go to (left or right).
+   *
+   * @param point Point to calculate direction of.
+   * @param classProbabilities Auxiliary information for the split.
+   * @param aux (Unused) auxiliary information for the split.
+   */
   template<typename ElemType>
   static size_t CalculateDirection(
       const ElemType& point,
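
The exhaustive search described above amounts to: sort the dimension's
values, consider a threshold between every adjacent pair of distinct
values, and keep the candidate with the best size-weighted fitness.  A
hedged standalone sketch (BestBinarySplit is a hypothetical name, the
fitness functor stands in for FitnessFunction, and minimumLeafSize
handling is omitted for brevity):

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <limits>
    #include <utility>
    #include <vector>

    // Exhaustive binary split search over one numeric dimension (sketch).
    double BestBinarySplit(
        std::vector<std::pair<double, std::size_t>> pts, // (value, label)
        const std::size_t numClasses,
        // Fitness of a label set; higher is better (e.g. negative Gini).
        const std::function<double(const std::vector<std::size_t>&,
                                   std::size_t)>& fitness,
        double& bestThreshold)
    {
      std::sort(pts.begin(), pts.end());
      double bestGain = -std::numeric_limits<double>::infinity();
      for (std::size_t i = 1; i < pts.size(); ++i)
      {
        if (pts[i - 1].first == pts[i].first)
          continue; // No threshold can separate equal values.
        std::vector<std::size_t> left, right;
        for (std::size_t j = 0; j < pts.size(); ++j)
          (j < i ? left : right).push_back(pts[j].second);
        const double gain =
            (double(left.size()) / pts.size()) * fitness(left, numClasses) +
            (double(right.size()) / pts.size()) * fitness(right, numClasses);
        if (gain > bestGain)
        {
          bestGain = gain;
          bestThreshold = 0.5 * (pts[i - 1].first + pts[i].first);
        }
      }
      return bestGain;
    }

Re-bucketing at every candidate makes this quadratic; a production
implementation would presumably maintain running class counts and update
them as the threshold sweeps past each point.
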
diff --git a/src/mlpack/methods/decision_tree/decision_tree.hpp b/src/mlpack/methods/decision_tree/decision_tree.hpp
index 3503406..f42e3b6 100644
--- a/src/mlpack/methods/decision_tree/decision_tree.hpp
+++ b/src/mlpack/methods/decision_tree/decision_tree.hpp
@@ -41,7 +41,16 @@ class DecisionTree :
   typedef CategoricalSplitType<FitnessFunction> CategoricalSplit;
 
   /**
-   * Construct the decision tree on the given data and labels.
+   * Construct the decision tree on the given data and labels, where the data
+   * can be both numeric and categorical.  Setting minimumLeafSize too small may
+   * cause the tree to overfit, but setting it too large may cause it to
+   * underfit.
+   *
+   * @param data Dataset to train on.
+   * @param datasetInfo Type information for each dimension of the dataset.
+   * @param labels Labels for each training point.
+   * @param numClasses Number of classes in the dataset.
+   * @param minimumLeafSize Minimum number of points in each leaf node.
    */
   template<typename MatType>
   DecisionTree(const MatType& data,
@@ -52,7 +61,14 @@ class DecisionTree :
 
   /**
    * Construct the decision tree on the given data and labels, assuming that the
-   * data is all of the numeric type.
+   * data is entirely numeric.  Setting minimumLeafSize too small may
+   * cause the tree to overfit, but setting it too large may cause it to
+   * underfit.
+   *
+   * @param data Dataset to train on.
+   * @param labels Labels for each training point.
+   * @param numClasses Number of classes in the dataset.
+   * @param minimumLeafSize Minimum number of points in each leaf node.
    */
   template<typename MatType>
   DecisionTree(const MatType& data,
@@ -63,6 +79,8 @@ class DecisionTree :
   /**
    * Construct a decision tree without training it.  It will be a leaf node with
    * equal probabilities for each class.
+   *
+   * @param numClasses Number of classes in the dataset.
    */
   DecisionTree(const size_t numClasses = 1);
 
@@ -103,7 +121,15 @@ class DecisionTree :
 
   /**
    * Train the decision tree on the given data.  This will overwrite the
-   * existing model.
+   * existing model.  The data may have numeric and categorical types, specified
+   * by the datasetInfo parameter.  Setting minimumLeafSize too small may cause
+   * the tree to overfit, but setting it too large may cause it to underfit.
+   *
+   * @param data Dataset to train on.
+   * @param datasetInfo Type information for each dimension.
+   * @param labels Labels for each training point.
+   * @param numClasses Number of classes in the dataset.
+   * @param minimumLeafSize Minimum number of points in each leaf node.
    */
   template<typename MatType>
   void Train(const MatType& data,
@@ -114,7 +140,14 @@ class DecisionTree :
 
   /**
    * Train the decision tree on the given data, assuming that all dimensions are
-   * numeric.  This will overwrite the given model.
+   * numeric.  This will overwrite the existing model.  Setting minimumLeafSize too
+   * small may cause the tree to overfit, but setting it too large may cause it
+   * to underfit.
+   *
+   * @param data Dataset to train on.
+   * @param labels Labels for each training point.
+   * @param numClasses Number of classes in the dataset.
+   * @param minimumLeafSize Minimum number of points in each leaf node.
    */
   template<typename MatType>
   void Train(const MatType& data,
@@ -125,6 +158,8 @@ class DecisionTree :
   /**
    * Classify the given point, using the entire tree.  The predicted label is
    * returned.
+   *
+   * @param point Point to classify.
    */
   template<typename VecType>
   size_t Classify(const VecType& point) const;
@@ -132,6 +167,11 @@ class DecisionTree :
   /**
    * Classify the given point and also return estimates of the probability for
    * each class in the given vector.
+   *
+   * @param point Point to classify.
+   * @param prediction This will be set to the predicted class of the point.
+   * @param probabilities This will be filled with class probabilities for the
+   *      point.
    */
   template<typename VecType>
   void Classify(const VecType& point,
@@ -141,6 +181,9 @@ class DecisionTree :
   /**
    * Classify the given points, using the entire tree.  The predicted labels for
    * each point are stored in the given vector.
+   *
+   * @param data Set of points to classify.
+   * @param predictions This will be filled with predictions for each point.
    */
   template<typename MatType>
   void Classify(const MatType& data,
@@ -150,6 +193,11 @@ class DecisionTree :
    * Classify the given points and also return estimates of the probabilities
    * for each class in the given matrix.  The predicted labels for each point
    * are stored in the given vector.
+   *
+   * @param data Set of points to classify.
+   * @param predictions This will be filled with predictions for each point.
+   * @param probabilities This will be filled with class probabilities for each
+   *      point.
    */
   template<typename MatType>
   void Classify(const MatType& data,
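
Taken together, the constructors and Classify() overloads documented in
this file suggest usage along the following lines.  This is a sketch based
only on the signatures above; the template defaults and the arma types for
the labels and predictions are assumptions, not something this commit
shows:

    #include <mlpack/core.hpp>
    #include <mlpack/methods/decision_tree/decision_tree.hpp>

    int main()
    {
      // Synthetic all-numeric data; one column per point, as usual in
      // mlpack.
      arma::mat data(10, 200, arma::fill::randu);
      arma::Row<size_t> labels(200);
      for (size_t i = 0; i < 200; ++i)
        labels[i] = (data(0, i) > 0.5) ? 1 : 0;

      const size_t numClasses = 2;
      // Too small risks overfitting; too large risks underfitting.
      const size_t minimumLeafSize = 10;

      // All-numeric constructor documented above.
      mlpack::tree::DecisionTree<> tree(data, labels, numClasses,
                                        minimumLeafSize);

      // Batch classification: one prediction per column of the input.
      arma::Row<size_t> predictions;
      tree.Classify(data, predictions);

      return 0;
    }
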
diff --git a/src/mlpack/methods/decision_tree/gini_gain.hpp b/src/mlpack/methods/decision_tree/gini_gain.hpp
index e075c87..c1f08da 100644
--- a/src/mlpack/methods/decision_tree/gini_gain.hpp
+++ b/src/mlpack/methods/decision_tree/gini_gain.hpp
@@ -2,7 +2,7 @@
  * @file gini_gain.hpp
  * @author Ryan Curtin
  *
- * The GiniImpurity class, which is a fitness function (FitnessFunction) for
+ * The GiniGain class, which is a fitness function (FitnessFunction) for
  * decision trees.
  *
  * mlpack is free software; you may redistribute it and/or modify it under the
@@ -32,6 +32,7 @@ class GiniGain
    * an Armadillo vector that holds size_t objects.
    *
    * @param labels Set of labels to evaluate Gini impurity on.
+   * @param numClasses Number of classes in the dataset.
    */
   template<typename RowType>
   static double Evaluate(const RowType& labels,
@@ -61,6 +62,8 @@ class GiniGain
    * Return the range of the Gini impurity for the given number of classes.
    * (That is, the difference between the maximum possible value and the minimum
    * possible value.)
+   *
+   * @param numClasses Number of classes in the dataset.
    */
   static double Range(const size_t numClasses)
   {
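
For a concrete sense of that range: the Gini impurity is 0 for a pure node
and peaks at 1 - 1/k when all k classes are equally likely (k = 2 gives a
maximum of 0.5), so a plausible body for Range() is simply the difference
of those two extremes.  A sketch, not necessarily the verbatim source:

    // Range of the Gini impurity over k classes: (1 - 1/k) - 0.
    static double Range(const size_t numClasses)
    {
      return 1.0 - (1.0 / static_cast<double>(numClasses));
    }
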
diff --git a/src/mlpack/methods/decision_tree/information_gain.hpp b/src/mlpack/methods/decision_tree/information_gain.hpp
index 85eef7e..2dbf814 100644
--- a/src/mlpack/methods/decision_tree/information_gain.hpp
+++ b/src/mlpack/methods/decision_tree/information_gain.hpp
@@ -3,7 +3,7 @@
  * @author Ryan Curtin
  *
  * An implementation of information gain, which can be used in place of Gini
- * impurity.
+ * gain.
  *
  * mlpack is free software; you may redistribute it and/or modify it under the
  * terms of the 3-clause BSD license.  You should have received a copy of the
@@ -18,17 +18,18 @@
 namespace mlpack {
 namespace tree {
 
+/**
+ * The standard information gain criterion, used for calculating gain in
+ * decision trees.
+ */
 class InformationGain
 {
  public:
   /**
-   * Given the sufficient statistics of a proposed split, calculate the
-   * information gain if that split was to be used.  The 'counts' matrix should
-   * contain the number of points in each class in each column, so the size of
-   * 'counts' is children x classes, where 'children' is the number of child
-   * nodes in the proposed split.
+   * Given a set of labels, calculate the information gain of those labels.
    *
-   * @param counts Matrix of sufficient statistics.
+   * @param labels Labels of the dataset.
+   * @param numClasses Number of classes in the dataset.
    */
   static double Evaluate(const arma::Row<size_t>& labels,
                          const size_t numClasses)
@@ -59,6 +60,8 @@ class InformationGain
    * Return the range of the information gain for the given number of classes.
    * (That is, the difference between the maximum possible value and the minimum
    * possible value.)
+   *
+   * @param numClasses Number of classes in the dataset.
    */
   static double Range(const size_t numClasses)
   {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git


