[mlpack] 59/207: Revert changes to decision stumps that I didn't mean to push.

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu Mar 23 17:53:40 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit a2679d690529ed36f4529d31c1830cabad51dee2
Author: Ryan Curtin <ryan at ratml.org>
Date:   Thu Feb 9 11:07:44 2017 -0500

    Revert changes to decision stumps that I didn't mean to push.
---
 .../methods/decision_stump/decision_stump.hpp      |  95 ++----
 .../methods/decision_stump/decision_stump_impl.hpp | 370 +++++----------------
 src/mlpack/tests/adaboost_test.cpp                 |  15 +-
 src/mlpack/tests/decision_stump_test.cpp           | 237 ++-----------
 src/mlpack/tests/serialization_test.cpp            |   8 +-
 5 files changed, 126 insertions(+), 599 deletions(-)

diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 0d36bfa..5918aaa 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -29,11 +29,8 @@ namespace decision_stump {
  * Points that are below the first bin will take the label of the first bin.
  *
  * @tparam MatType Type of matrix that is being used (sparse or dense).
- * @tparam NoRecursion If true, this will create a stump (a one-level decision
- *       tree).
  */
-template<typename MatType = arma::mat,
-         bool NoRecursion = true>
+template<typename MatType = arma::mat>
 class DecisionStump
 {
  public:
@@ -75,39 +72,6 @@ class DecisionStump
   DecisionStump();
 
   /**
-   * Copy the given decision stump.
-   *
-   * @param other Decision stump to copy.
-   */
-  DecisionStump(const DecisionStump& other);
-
-  /**
-   * Take ownership of the given decision stump.
-   *
-   * @param other Decision stump to take ownership of.
-   */
-  DecisionStump(DecisionStump&& other);
-
-  /**
-   * Copy the given decision stump.
-   *
-   * @param other Decision stump to copy.
-   */
-  DecisionStump& operator=(const DecisionStump& other);
-
-  /**
-   * Take ownership of the given decision stump.
-   *
-   * @param other Decision stump to take ownership of.
-   */
-  DecisionStump& operator=(DecisionStump&& other);
-
-  /**
-   * Destroy the decision stump.
-   */
-  ~DecisionStump();
-
-  /**
    * Train the decision stump on the given data.  This completely overwrites any
    * previous training data, so after training the stump may be completely
    * different.
@@ -133,57 +97,36 @@ class DecisionStump
   void Classify(const MatType& test, arma::Row<size_t>& predictedLabels);
 
   //! Access the splitting dimension.
-  size_t SplitDimension() const { return splitDimensionOrLabel; }
+  size_t SplitDimension() const { return splitDimension; }
   //! Modify the splitting dimension (be careful!).
-  size_t& SplitDimension() { return splitDimensionOrLabel; }
+  size_t& SplitDimension() { return splitDimension; }
 
   //! Access the splitting values.
-  const arma::vec& Split() const { return splitOrClassProbs; }
+  const arma::vec& Split() const { return split; }
   //! Modify the splitting values (be careful!).
-  arma::vec& Split() { return splitOrClassProbs; }
-
-  //! Access the label.
-  size_t Label() const { return splitDimensionOrLabel; }
-  //! Modify the label.
-  size_t& Label() { return splitDimensionOrLabel; }
-
-  //! Get the number of children.
-  size_t NumChildren() const { return children.size(); }
+  arma::vec& Split() { return split; }
 
-  //! Access the given child.
-  const DecisionStump& Child(const size_t i) const { return *children[i]; }
-  //! Modify the given child.
-  DecisionStump& Child(const size_t i) { return *children[i]; }
+  //! Access the labels for each split bin.
+  const arma::Col<size_t> BinLabels() const { return binLabels; }
+  //! Modify the labels for each split bin (be careful!).
+  arma::Col<size_t>& BinLabels() { return binLabels; }
 
   //! Serialize the decision stump.
   template<typename Archive>
   void Serialize(Archive& ar, const unsigned int /* version */);
 
  private:
-  /**
-   * Construct a leaf with the given probabilities and class label.
-   *
-   * @param bucketSize Bucket size for training.
-   * @param label Majority label of leaf.
-   * @param probabilities Class probabilities of leaf.
-   */
-  DecisionStump(const size_t bucketSize,
-                const size_t label,
-                arma::vec&& probabilities);
-
-  //! The number of classes in the model.
+  //! The number of classes (we must store this for boosting).
   size_t classes;
-  //! The minimum number of points in a bucket (training parameter).
+  //! The minimum number of points in a bucket.
   size_t bucketSize;
 
-  //! Stores the value of the dimension on which to split, or the label.
-  size_t splitDimensionOrLabel;
-  //! Stores either the splitting values after training, or the class
-  //! probabilities.
-  arma::vec splitOrClassProbs;
-
-  //! Stores the children (if any).
-  std::vector<DecisionStump*> children;
+  //! Stores the value of the dimension on which to split.
+  size_t splitDimension;
+  //! Stores the splitting values after training.
+  arma::vec split;
+  //! Stores the labels for each splitting bin.
+  arma::Col<size_t> binLabels;
 
   /**
    * Sets up dimension as if it were splitting on it and finds entropy when
@@ -205,8 +148,8 @@ class DecisionStump
   * @tparam dimension The dimension, decided by the constructor, on which
   *      we now train the decision stump.
    */
-  void TrainOnDim(const MatType& data,
-                  const size_t dimension,
+  template<typename VecType>
+  void TrainOnDim(const VecType& dimension,
                   const arma::Row<size_t>& labels);
 
   /**
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index c1b37d9..aa7201a 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -9,6 +9,7 @@
  * 3-clause BSD license along with mlpack.  If not, see
  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
  */
+
 #ifndef MLPACK_METHODS_DECISION_STUMP_DECISION_STUMP_IMPL_HPP
 #define MLPACK_METHODS_DECISION_STUMP_DECISION_STUMP_IMPL_HPP
 
@@ -26,12 +27,11 @@ namespace decision_stump {
  * @param classes Number of distinct classes in labels.
  * @param bucketSize Minimum size of bucket when splitting.
  */
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::DecisionStump(
-    const MatType& data,
-    const arma::Row<size_t>& labels,
-    const size_t classes,
-    const size_t bucketSize) :
+template<typename MatType>
+DecisionStump<MatType>::DecisionStump(const MatType& data,
+                                      const arma::Row<size_t>& labels,
+                                      const size_t classes,
+                                      const size_t bucketSize) :
     classes(classes),
     bucketSize(bucketSize)
 {
@@ -42,143 +42,26 @@ DecisionStump<MatType, NoRecursion>::DecisionStump(
 /**
  * Empty constructor.
  */
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::DecisionStump() :
+template<typename MatType>
+DecisionStump<MatType>::DecisionStump() :
     classes(1),
     bucketSize(0),
-    splitDimensionOrLabel(0),
-    splitOrClassProbs(1)
-{
-  splitOrClassProbs[0] = 1.0;
-  if (NoRecursion)
-  {
-    // Make a fake stump by creating two children.  We create two and not one so
-    // that we can be guaranteed that splitOrClassProbs has at least one
-    // element.  The children are identical in functionality though.  These fake
-    // children are necessary, because Predict() depends on a stump having
-    // children.
-    children.push_back(new DecisionStump(0, 0, std::move(arma::vec("1.0"))));
-    children.push_back(new DecisionStump(0, 0, std::move(arma::vec("1.0"))));
-  }
-}
-
-// Copy constructor.
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::DecisionStump(const DecisionStump& other) :
-    classes(other.classes),
-    bucketSize(other.bucketSize),
-    splitDimensionOrLabel(other.splitDimensionOrLabel),
-    splitOrClassProbs(other.splitOrClassProbs)
+    splitDimension(0),
+    split(1),
+    binLabels(1)
 {
-  for (size_t i = 0; i < other.children.size(); ++i)
-    children.push_back(new DecisionStump(*other.children[i]));
-}
-
-// Move constructor.
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::DecisionStump(DecisionStump&& other) :
-    classes(other.classes),
-    bucketSize(other.bucketSize),
-    splitDimensionOrLabel(other.splitDimensionOrLabel),
-    splitOrClassProbs(std::move(other.splitOrClassProbs)),
-    children(std::move(other.children))
-{
-  // Reset the other one.
-  other.classes = 1;
-  other.bucketSize = 0;
-  other.splitDimensionOrLabel = 0;
-  other.splitOrClassProbs.ones(1);
-  if (NoRecursion)
-  {
-    // Make a fake stump by creating two children.  We create two and not one so
-    // that we can be guaranteed that splitOrClassProbs has at least one
-    // element.  The children are identical in functionality though.  These fake
-    // children are necessary, because Predict() depends on a stump having
-    // children.
-    other.children.push_back(new DecisionStump(0, 0,
-        std::move(arma::vec("1.0"))));
-    other.children.push_back(new DecisionStump(0, 0,
-        std::move(arma::vec("1.0"))));
-  }
-}
-
-// Copy assignment operator.
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>&
-DecisionStump<MatType, NoRecursion>::operator=(const DecisionStump& other)
-{
-  // Clear existing memory.
-  for (size_t i = 0; i < children.size(); ++i)
-    delete children[i];
-  children.clear();
-
-  classes = other.classes;
-  bucketSize = other.bucketSize;
-  splitDimensionOrLabel = other.splitDimensionOrLabel;
-  splitOrClassProbs = other.splitOrClassProbs;
-
-  // Create copies of the children.
-  for (size_t i = 0; i < other.children.size(); ++i)
-    children.push_back(new DecisionStump(*other.children[i]));
-
-  return *this;
-}
-
-// Move assignment operator.
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>&
-DecisionStump<MatType, NoRecursion>::operator=(DecisionStump&& other)
-{
-  // Clear existing memory.
-  for (size_t i = 0; i < children.size(); ++i)
-    delete children[i];
-  children.clear();
-
-  classes = other.classes;
-  bucketSize = other.bucketSize;
-  splitDimensionOrLabel = other.splitDimensionOrLabel;
-  splitOrClassProbs = std::move(other.splitOrClassProbs);
-  children = std::move(other.children);
-
-  // Clear and reinitialize other object.
-  other.classes = 1;
-  other.bucketSize = 0;
-  other.splitDimensionOrLabel = 0;
-  other.splitOrClassProbs.ones(1);
-  if (NoRecursion)
-  {
-    // Make a fake stump by creating two children.  We create two and not one so
-    // that we can be guaranteed that splitOrClassProbs has at least one
-    // element.  The children are identical in functionality though.  These fake
-    // children are necessary, because Predict() depends on a stump having
-    // children.
-    other.children.push_back(new DecisionStump(0, 0,
-        std::move(arma::vec("1.0"))));
-    other.children.push_back(new DecisionStump(0, 0,
-        std::move(arma::vec("1.0"))));
-  }
-
-  return *this;
-}
-
-// Destructor.
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::~DecisionStump()
-{
-  for (size_t i = 0; i < children.size(); ++i)
-    delete children[i];
-  children.clear();
+  split[0] = DBL_MAX;
+  binLabels[0] = 0;
 }
 
 /**
  * Train on the given data and labels.
  */
-template<typename MatType, bool NoRecursion>
-void DecisionStump<MatType, NoRecursion>::Train(
-    const MatType& data,
-    const arma::Row<size_t>& labels,
-    const size_t classes,
-    const size_t bucketSize)
+template<typename MatType>
+void DecisionStump<MatType>::Train(const MatType& data,
+                                   const arma::Row<size_t>& labels,
+                                   const size_t classes,
+                                   const size_t bucketSize)
 {
   this->classes = classes;
   this->bucketSize = bucketSize;
@@ -195,12 +78,11 @@ void DecisionStump<MatType, NoRecursion>::Train(
  * @param labels Labels for dataset.
  * @param UseWeights Whether we need to run a weighted Decision Stump.
  */
-template<typename MatType, bool NoRecursion>
+template<typename MatType>
 template<bool UseWeights>
-void DecisionStump<MatType, NoRecursion>::Train(
-    const MatType& data,
-    const arma::Row<size_t>& labels,
-    const arma::rowvec& weights)
+void DecisionStump<MatType>::Train(const MatType& data,
+                                   const arma::Row<size_t>& labels,
+                                   const arma::rowvec& weights)
 {
   // If classLabels are not all identical, proceed with training.
   size_t bestDim = 0;
@@ -230,10 +112,10 @@ void DecisionStump<MatType, NoRecursion>::Train(
       }
     }
   }
-  splitDimensionOrLabel = bestDim;
+  splitDimension = bestDim;
 
   // Once the splitting column/dimension has been decided, train on it.
-  TrainOnDim(data, splitDimensionOrLabel, labels);
+  TrainOnDim(data.row(splitDimension), labels);
 }
 
 /**
@@ -244,10 +126,9 @@ void DecisionStump<MatType, NoRecursion>::Train(
  * @param predictedLabels Vector to store the predicted classes after
  *      classifying test
  */
-template<typename MatType, bool NoRecursion>
-void DecisionStump<MatType, NoRecursion>::Classify(
-    const MatType& test,
-    arma::Row<size_t>& predictedLabels)
+template<typename MatType>
+void DecisionStump<MatType>::Classify(const MatType& test,
+                                      arma::Row<size_t>& predictedLabels)
 {
   predictedLabels.set_size(test.n_cols);
   for (size_t i = 0; i < test.n_cols; i++)
@@ -256,20 +137,17 @@ void DecisionStump<MatType, NoRecursion>::Classify(
     // Assume first that it falls into the first bin, then proceed through the
     // bins until it is known which bin it falls into.
     size_t bin = 0;
-    const double val = test(splitDimensionOrLabel, i);
+    const double val = test(splitDimension, i);
 
-    while (bin < splitOrClassProbs.n_elem - 1)
+    while (bin < split.n_elem - 1)
     {
-      if (val < splitOrClassProbs(bin + 1))
+      if (val < split(bin + 1))
         break;
 
       ++bin;
     }
 
-    if (NoRecursion)
-      predictedLabels(i) = children[bin]->Label();
-    else
-      children[bin]->Classify(test, predictedLabels);
+    predictedLabels(i) = binLabels(bin);
   }
 }
 
@@ -285,12 +163,11 @@ void DecisionStump<MatType, NoRecursion>::Classify(
  * @param labels The labels of data.
  * @param UseWeights Whether we need to run a weighted Decision Stump.
  */
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::DecisionStump(
-    const DecisionStump<>& other,
-    const MatType& data,
-    const arma::Row<size_t>& labels,
-    const arma::rowvec& weights) :
+template<typename MatType>
+DecisionStump<MatType>::DecisionStump(const DecisionStump<>& other,
+                                      const MatType& data,
+                                      const arma::Row<size_t>& labels,
+                                      const arma::rowvec& weights) :
     classes(other.classes),
     bucketSize(other.bucketSize)
 {
@@ -300,11 +177,10 @@ DecisionStump<MatType, NoRecursion>::DecisionStump(
 /**
  * Serialize the decision stump.
  */
-template<typename MatType, bool NoRecursion>
+template<typename MatType>
 template<typename Archive>
-void DecisionStump<MatType, NoRecursion>::Serialize(
-    Archive& ar,
-    const unsigned int /* version */)
+void DecisionStump<MatType>::Serialize(Archive& ar,
+                                       const unsigned int /* version */)
 {
   using data::CreateNVP;
 
@@ -312,40 +188,9 @@ void DecisionStump<MatType, NoRecursion>::Serialize(
   // None need special handling.
   ar & CreateNVP(classes, "classes");
   ar & CreateNVP(bucketSize, "bucketSize");
-  ar & CreateNVP(splitDimensionOrLabel, "splitDimensionOrLabel");
-  ar & CreateNVP(splitOrClassProbs, "splitOrClassProbs");
-
-  size_t numChildren = children.size();
-  ar & CreateNVP(numChildren, "numChildren");
-  if (Archive::is_loading::value)
-  {
-    // Clear memory and prepare for loading children.
-    for (size_t i = 0; i < children.size(); ++i)
-      delete children[i];
-    children.clear();
-    children.resize(numChildren);
-  }
-
-  for (size_t i = 0; i < numChildren; ++i)
-  {
-    std::ostringstream oss;
-    oss << "child" << i;
-    ar & CreateNVP(children[i], oss.str());
-  }
-}
-
-/**
- * Create a leaf manually.
- */
-template<typename MatType, bool NoRecursion>
-DecisionStump<MatType, NoRecursion>::DecisionStump(const size_t bucketSize,
-                                                   const size_t label,
-                                                   arma::vec&& probabilities) :
-    bucketSize(bucketSize),
-    splitDimensionOrLabel(label),
-    splitOrClassProbs(std::move(probabilities))
-{
-  // Nothing else to do.
+  ar & CreateNVP(splitDimension, "splitDimension");
+  ar & CreateNVP(split, "split");
+  ar & CreateNVP(binLabels, "binLabels");
 }
 
 /**
@@ -356,9 +201,9 @@ DecisionStump<MatType, NoRecursion>::DecisionStump(const size_t bucketSize,
  *      the splitting dimension.
  * @param UseWeights Whether we need to run a weighted Decision Stump.
  */
-template<typename MatType, bool NoRecursion>
+template<typename MatType>
 template<bool UseWeights, typename VecType>
-double DecisionStump<MatType, NoRecursion>::SetupSplitDimension(
+double DecisionStump<MatType>::SetupSplitDimension(
     const VecType& dimension,
     const arma::Row<size_t>& labels,
     const arma::rowvec& weights)
@@ -446,31 +291,24 @@ double DecisionStump<MatType, NoRecursion>::SetupSplitDimension(
 * @param dimension The dimension, decided by the constructor, on which we
 *      now train the decision stump.
  */
-template<typename MatType, bool NoRecursion>
-void DecisionStump<MatType, NoRecursion>::TrainOnDim(
-    const MatType& data,
-    const size_t dimension,
-    const arma::Row<size_t>& labels)
+template<typename MatType>
+template<typename VecType>
+void DecisionStump<MatType>::TrainOnDim(const VecType& dimension,
+                                        const arma::Row<size_t>& labels)
 {
   size_t i, count, begin, end;
 
-  typename MatType::row_type sortedSplitDim = arma::sort(data.row(dimension));
-  arma::uvec sortedSplitIndexDim =
-      arma::stable_sort_index(data.row(dimension).t());
-  arma::Row<size_t> sortedLabels(data.n_cols);
-  arma::Col<size_t> binLabels;
+  typename MatType::row_type sortedSplitDim = arma::sort(dimension);
+  arma::uvec sortedSplitIndexDim = arma::stable_sort_index(dimension.t());
+  arma::Row<size_t> sortedLabels(dimension.n_elem);
+  sortedLabels.fill(0);
 
-  for (i = 0; i < data.n_cols; i++)
+  for (i = 0; i < dimension.n_elem; i++)
     sortedLabels(i) = labels(sortedSplitIndexDim(i));
 
-  /**
-   * Loop through the points, splitting when it is advantageous.  We check to
-   * see if we can split at index i, and then if we can, the split will take the
-   * value that's the midpoint between index i and index i + 1.
-   */
   arma::rowvec subCols;
   double mostFreq;
-  i = bucketSize;
+  i = 0;
   count = 0;
   while (i < sortedLabels.n_elem)
   {
@@ -482,8 +320,8 @@ void DecisionStump<MatType, NoRecursion>::TrainOnDim(
 
       mostFreq = CountMostFreq(sortedLabels.cols(begin, end));
 
-      splitOrClassProbs.resize(splitOrClassProbs.n_elem + 1);
-      splitOrClassProbs(splitOrClassProbs.n_elem - 1) = sortedSplitDim(begin);
+      split.resize(split.n_elem + 1);
+      split(split.n_elem - 1) = sortedSplitDim(begin);
       binLabels.resize(binLabels.n_elem + 1);
       binLabels(binLabels.n_elem - 1) = mostFreq;
 
@@ -510,8 +348,8 @@ void DecisionStump<MatType, NoRecursion>::TrainOnDim(
       // the bucket of subCols.
       mostFreq = CountMostFreq(sortedLabels.cols(begin, end));
 
-      splitOrClassProbs.resize(splitOrClassProbs.n_elem + 1);
-      splitOrClassProbs(splitOrClassProbs.n_elem - 1) = sortedSplitDim(end + 1);
+      split.resize(split.n_elem + 1);
+      split(split.n_elem - 1) = sortedSplitDim(begin);
       binLabels.resize(binLabels.n_elem + 1);
       binLabels(binLabels.n_elem - 1) = mostFreq;
 
@@ -524,88 +362,32 @@ void DecisionStump<MatType, NoRecursion>::TrainOnDim(
 
   // Now trim the split matrix so that buckets, one after the other, which
   // point to the same classLabel are merged into one big bucket.
-  for (size_t i = 1; i < splitOrClassProbs.n_rows; i++)
+  MergeRanges();
+}
+
+/**
+ * After the "split" matrix has been set up, merge ranges with identical class
+ * labels.
+ */
+template<typename MatType>
+void DecisionStump<MatType>::MergeRanges()
+{
+  for (size_t i = 1; i < split.n_rows; i++)
   {
     if (binLabels(i) == binLabels(i - 1))
     {
       // Remove this row, as it has the same label as the previous bucket.
       binLabels.shed_row(i);
-      splitOrClassProbs.shed_row(i);
+      split.shed_row(i);
       // Go back to previous row.
       i--;
     }
   }
-
-  // Now create the children, either recursively (if we are not a tree) or not
-  // (if we are a stump).
-  if (NoRecursion)
-  {
-    size_t begin = 0;
-    for (size_t i = 0; i < splitOrClassProbs.n_elem; ++i)
-    {
-      // Calculate class probabilities for children.
-      arma::vec childClassProbs(classes);
-      childClassProbs.zeros();
-
-      size_t lastBegin = begin;
-      do
-      {
-        childClassProbs[sortedLabels[begin]]++;
-      } while (sortedSplitDim(++begin) < splitOrClassProbs[i]);
-
-      // Normalize probabilities.
-      childClassProbs /= (begin - lastBegin);
-
-      // Create child.
-      children.push_back(new DecisionStump(bucketSize, binLabels[i],
-          std::move(childClassProbs)));
-    }
-
-    // Create the last child.
-    arma::vec childClassProbs(classes);
-    childClassProbs.zeros();
-
-    size_t lastBegin = begin;
-    do
-    {
-      childClassProbs[sortedLabels[begin]]++;
-    } while (++begin < sortedSplitDim.n_elem);
-
-    // Normalize probabilities.
-    childClassProbs /= (begin - lastBegin);
-
-    // Create child.
-    children.push_back(new DecisionStump(bucketSize,
-        binLabels[binLabels.n_elem - 1], std::move(childClassProbs)));
-  }
-  else
-  {
-    // Do recursion.
-    size_t begin = 0;
-    for (size_t i = 0; i < splitOrClassProbs.n_elem; ++i)
-    {
-      // Determine how many points are in this child.
-      size_t lastBegin = begin;
-      while (sortedSplitDim(++begin) < splitOrClassProbs[i]) { }
-      size_t numPoints = (lastBegin - begin);
-
-      // Allocate memory for child data and fill it.
-      MatType childData(data.n_rows, numPoints);
-      for (size_t i = lastBegin; i < begin; ++i)
-        childData.col(i - lastBegin) = data.col(sortedSplitIndexDim[i]);
-      arma::Row<size_t> childLabels = sortedLabels.subvec(lastBegin, begin - 1);
-
-      // Create the child recursively.
-      children.push_back(new DecisionStump(childData, childLabels, classes,
-          bucketSize));
-    }
-  }
 }
 
-template<typename MatType, bool NoRecursion>
+template<typename MatType>
 template<typename VecType>
-double DecisionStump<MatType, NoRecursion>::CountMostFreq(
-    const VecType& subCols)
+double DecisionStump<MatType>::CountMostFreq(const VecType& subCols)
 {
   // We'll create a map of elements and the number of times that each element is
   // seen.
@@ -642,9 +424,9 @@ double DecisionStump<MatType, NoRecursion>::CountMostFreq(
  *
  * @param featureRow The dimension which is checked for identical values.
  */
-template<typename MatType, bool NoRecursion>
+template<typename MatType>
 template<typename VecType>
-int DecisionStump<MatType, NoRecursion>::IsDistinct(const VecType& featureRow)
+int DecisionStump<MatType>::IsDistinct(const VecType& featureRow)
 {
   typename VecType::elem_type val = featureRow(0);
   for (size_t i = 1; i < featureRow.n_elem; ++i)
@@ -659,9 +441,9 @@ int DecisionStump<MatType, NoRecursion>::IsDistinct(const VecType& featureRow)
  * @param labels Corresponding labels of the dimension.
  * @param UseWeights Whether we need to run a weighted Decision Stump.
  */
-template<typename MatType, bool NoRecursion>
+template<typename MatType>
 template<bool UseWeights, typename VecType, typename WeightVecType>
-double DecisionStump<MatType, NoRecursion>::CalculateEntropy(
+double DecisionStump<MatType>::CalculateEntropy(
     const VecType& labels,
     const WeightVecType& weights)
 {
diff --git a/src/mlpack/tests/adaboost_test.cpp b/src/mlpack/tests/adaboost_test.cpp
index b67f7de..542eb59 100644
--- a/src/mlpack/tests/adaboost_test.cpp
+++ b/src/mlpack/tests/adaboost_test.cpp
@@ -125,7 +125,7 @@ BOOST_AUTO_TEST_CASE(HammingLossBoundVertebralColumn)
     BOOST_FAIL("Cannot load test dataset vc2.csv!");
 
   arma::Mat<size_t> labels;
-  if (!data::Load("vc2_labels.txt", labels))
+  if (!data::Load("vc2_labels.txt",labels))
     BOOST_FAIL("Cannot load labels for vc2_labels.txt");
 
   // Define your own weak learner, perceptron in this case.
@@ -875,15 +875,10 @@ BOOST_AUTO_TEST_CASE(DecisionStumpSerializationTest)
                   abText.WeakLearner(i).Split(),
                   abBinary.WeakLearner(i).Split());
 
-    for (size_t j = 0; j < ab.WeakLearner(i).Split().n_elem + 1; ++j)
-    {
-      BOOST_REQUIRE_EQUAL(ab.WeakLearner(i).Child(j).Label(),
-                          abXml.WeakLearner(i).Child(j).Label());
-      BOOST_REQUIRE_EQUAL(ab.WeakLearner(i).Child(j).Label(),
-                          abText.WeakLearner(i).Child(j).Label());
-      BOOST_REQUIRE_EQUAL(ab.WeakLearner(i).Child(j).Label(),
-                          abBinary.WeakLearner(i).Child(j).Label());
-    }
+    CheckMatrices(ab.WeakLearner(i).BinLabels(),
+                  abXml.WeakLearner(i).BinLabels(),
+                  abText.WeakLearner(i).BinLabels(),
+                  abBinary.WeakLearner(i).BinLabels());
   }
 }
 
diff --git a/src/mlpack/tests/decision_stump_test.cpp b/src/mlpack/tests/decision_stump_test.cpp
index 21d1d71..af1e5f7 100644
--- a/src/mlpack/tests/decision_stump_test.cpp
+++ b/src/mlpack/tests/decision_stump_test.cpp
@@ -307,9 +307,9 @@ BOOST_AUTO_TEST_CASE(DimensionSelectionTest)
   for (size_t i = 0; i < ds.Split().n_elem; ++i)
   {
     if (ds.Split()[i] <= -3.0)
-      BOOST_CHECK_EQUAL(ds.Child(i).Label(), 0);
+      BOOST_CHECK_EQUAL(ds.BinLabels()[i], 0);
     else if (ds.Split()[i] >= 3.0)
-      BOOST_CHECK_EQUAL(ds.Child(i).Label(), 1);
+      BOOST_CHECK_EQUAL(ds.BinLabels()[i], 1);
   }
 }
 
@@ -359,226 +359,37 @@ BOOST_AUTO_TEST_CASE(EmptyConstructorTest)
 }
 
 /**
- * Test the copy constructor for a stump.
+ * Ensure that a matrix holding ints can be trained.  The bigger issue here is
+ * just compilation.
  */
-BOOST_AUTO_TEST_CASE(DecisionStumpCopyConstructorTest)
+BOOST_AUTO_TEST_CASE(IntTest)
 {
-  // This dataset comes from Chapter 6 of the book "Data Mining: Concepts,
-  // Models, Methods, and Algorithms" (2nd Edition) by Mehmed Kantardzic.  It is
-  // found on page 176 (and a description of the correct splitting dimension is
-  // given below that).
-  mat trainingData;
-  trainingData << 0  << 0  << 0  << 0  << 0  << 1  << 1  << 1  << 1
-               << 2  << 2  << 2  << 2  << 2  << endr
-               << 70 << 90 << 85 << 95 << 70 << 90 << 78 << 65 << 75
-               << 80 << 70 << 80 << 80 << 96 << endr
-               << 1  << 1  << 0  << 0  << 0  << 1  << 0  << 1  << 0
-               << 1  << 1  << 0  << 0  << 0  << endr;
-
-  // No need to normalize labels here.
-  Mat<size_t> labelsIn;
-  labelsIn << 0 << 1 << 1 << 1 << 0 << 0 << 0 << 0
-           << 0 << 1 << 1 << 0 << 0 << 0;
-
-  DecisionStump<> d(trainingData, labelsIn.row(0), 2);
-
-  // Make a copy.
-  DecisionStump<> copy(d);
-  DecisionStump<> copy2 = d;
-
-  // Check the objects for similarity.
-  BOOST_REQUIRE_EQUAL(d.Split().n_elem, copy.Split().n_elem);
-  BOOST_REQUIRE_EQUAL(d.Split().n_elem, copy2.Split().n_elem);
-  BOOST_REQUIRE_EQUAL(d.NumChildren(), copy.NumChildren());
-  BOOST_REQUIRE_EQUAL(d.NumChildren(), copy2.NumChildren());
-  for (size_t i = 0; i < d.NumChildren(); ++i)
-  {
-    BOOST_REQUIRE_EQUAL(d.Child(i).Label(), copy.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_rows,
-        copy.Child(i).Split().n_rows);
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_cols,
-        copy.Child(i).Split().n_cols);
-    for (size_t j = 0; j < d.Child(i).Split().n_elem; ++j)
-    {
-      if (std::abs(d.Child(i).Split()[j]) < 1e-5)
-        BOOST_REQUIRE_SMALL(copy.Child(i).Split()[j], 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(copy.Child(i).Split()[j], d.Child(i).Split()[j],
-            1e-5);
-    }
-
-    BOOST_REQUIRE_EQUAL(d.Child(i).Label(), copy2.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_rows,
-        copy2.Child(i).Split().n_rows);
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_cols,
-        copy2.Child(i).Split().n_cols);
-    for (size_t j = 0; j < d.Child(i).Split().n_elem; ++j)
-    {
-      if (std::abs(d.Child(i).Split()[j]) < 1e-5)
-        BOOST_REQUIRE_SMALL(copy2.Child(i).Split()[j], 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(copy2.Child(i).Split()[j], d.Child(i).Split()[j],
-            1e-5);
-    }
-  }
-}
-
-/**
- * Test the move constructor for a stump.
- */
-BOOST_AUTO_TEST_CASE(DecisionStumpMoveConstructorTest)
-{
-  // This dataset comes from Chapter 6 of the book "Data Mining: Concepts,
-  // Models, Methods, and Algorithms" (2nd Edition) by Mehmed Kantardzic.  It is
-  // found on page 176 (and a description of the correct splitting dimension is
-  // given below that).
-  mat trainingData;
-  trainingData << 0  << 0  << 0  << 0  << 0  << 1  << 1  << 1  << 1
-               << 2  << 2  << 2  << 2  << 2  << endr
-               << 70 << 90 << 85 << 95 << 70 << 90 << 78 << 65 << 75
-               << 80 << 70 << 80 << 80 << 96 << endr
-               << 1  << 1  << 0  << 0  << 0  << 1  << 0  << 1  << 0
-               << 1  << 1  << 0  << 0  << 0  << endr;
-
-  // No need to normalize labels here.
-  Mat<size_t> labelsIn;
-  labelsIn << 0 << 1 << 1 << 1 << 0 << 0 << 0 << 0
-           << 0 << 1 << 1 << 0 << 0 << 0;
-
-  DecisionStump<> d(trainingData, labelsIn.row(0), 2);
-  DecisionStump<> copy(d); // A copy to compare against.
-
-  DecisionStump<> move(std::move(d));
-  DecisionStump<> empty; // An empty object to compare against.
-
-  BOOST_REQUIRE_EQUAL(d.Split().n_elem, empty.Split().n_elem);
-  BOOST_REQUIRE_EQUAL(d.NumChildren(), empty.NumChildren());
-  for (size_t i = 0; i < d.NumChildren(); ++i)
-  {
-    BOOST_REQUIRE_EQUAL(d.Child(i).Label(), empty.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_rows,
-        empty.Child(i).Split().n_rows);
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_cols,
-        empty.Child(i).Split().n_cols);
-    for (size_t j = 0; j < d.Child(i).Split().n_elem; ++j)
-    {
-      if (std::abs(d.Child(i).Split()[j]) < 1e-5)
-        BOOST_REQUIRE_SMALL(empty.Child(i).Split()[j], 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(empty.Child(i).Split()[j], d.Child(i).Split()[j],
-            1e-5);
-    }
-  }
-
-  BOOST_REQUIRE_EQUAL(move.Split().n_elem, copy.Split().n_elem);
-  BOOST_REQUIRE_EQUAL(move.NumChildren(), copy.NumChildren());
-  for (size_t i = 0; i < move.NumChildren(); ++i)
-  {
-    BOOST_REQUIRE_EQUAL(move.Child(i).Label(), copy.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(move.Child(i).Split().n_rows,
-        copy.Child(i).Split().n_rows);
-    BOOST_REQUIRE_EQUAL(move.Child(i).Split().n_cols,
-        copy.Child(i).Split().n_cols);
-    for (size_t j = 0; j < move.Child(i).Split().n_elem; ++j)
-    {
-      if (std::abs(move.Child(i).Split()[j]) < 1e-5)
-        BOOST_REQUIRE_SMALL(copy.Child(i).Split()[j], 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(copy.Child(i).Split()[j], move.Child(i).Split()[j],
-            1e-5);
-    }
-  }
-}
-
-/**
- * Test the move operator.
- */
-BOOST_AUTO_TEST_CASE(DecisionStumpMoveOperatorTest)
-{
-  // This dataset comes from Chapter 6 of the book "Data Mining: Concepts,
-  // Models, Methods, and Algorithms" (2nd Edition) by Mehmed Kantardzic.  It is
-  // found on page 176 (and a description of the correct splitting dimension is
-  // given below that).
-  mat trainingData;
-  trainingData << 0  << 0  << 0  << 0  << 0  << 1  << 1  << 1  << 1
-               << 2  << 2  << 2  << 2  << 2  << endr
-               << 70 << 90 << 85 << 95 << 70 << 90 << 78 << 65 << 75
-               << 80 << 70 << 80 << 80 << 96 << endr
-               << 1  << 1  << 0  << 0  << 0  << 1  << 0  << 1  << 0
-               << 1  << 1  << 0  << 0  << 0  << endr;
+  // Train on a dataset and make sure something kind of makes sense.
+  imat trainingData;
+  trainingData << -7 << -6 << -5 << -4 << -3 << -2 << -1 << 0 << 1
+               << 2  << 3  << 4  << 5  << 6  << 7  << 8  << 9 << 10;
 
   // No need to normalize labels here.
   Mat<size_t> labelsIn;
-  labelsIn << 0 << 1 << 1 << 1 << 0 << 0 << 0 << 0
-           << 0 << 1 << 1 << 0 << 0 << 0;
-
-  DecisionStump<> d(trainingData, labelsIn.row(0), 2);
-  DecisionStump<> copy(d); // A copy to compare against.
-
-  DecisionStump<> move = std::move(d);
-  DecisionStump<> empty; // An empty object to compare against.
-
-  BOOST_REQUIRE_EQUAL(d.Split().n_elem, empty.Split().n_elem);
-  BOOST_REQUIRE_EQUAL(d.NumChildren(), empty.NumChildren());
-  for (size_t i = 0; i < d.NumChildren(); ++i)
-  {
-    BOOST_REQUIRE_EQUAL(d.Child(i).Label(), empty.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_rows,
-        empty.Child(i).Split().n_rows);
-    BOOST_REQUIRE_EQUAL(d.Child(i).Split().n_cols,
-        empty.Child(i).Split().n_cols);
-    for (size_t j = 0; j < d.Child(i).Split().n_elem; ++j)
-    {
-      if (std::abs(d.Child(i).Split()[j]) < 1e-5)
-        BOOST_REQUIRE_SMALL(empty.Child(i).Split()[j], 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(empty.Child(i).Split()[j], d.Child(i).Split()[j],
-            1e-5);
-    }
-  }
-
-  BOOST_REQUIRE_EQUAL(move.Split().n_elem, copy.Split().n_elem);
-  BOOST_REQUIRE_EQUAL(move.NumChildren(), copy.NumChildren());
-  for (size_t i = 0; i < move.NumChildren(); ++i)
-  {
-    BOOST_REQUIRE_EQUAL(move.Child(i).Label(), copy.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(move.Child(i).Split().n_rows,
-        copy.Child(i).Split().n_rows);
-    BOOST_REQUIRE_EQUAL(move.Child(i).Split().n_cols,
-        copy.Child(i).Split().n_cols);
-    for (size_t j = 0; j < move.Child(i).Split().n_elem; ++j)
-    {
-      if (std::abs(move.Child(i).Split()[j]) < 1e-5)
-        BOOST_REQUIRE_SMALL(copy.Child(i).Split()[j], 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(copy.Child(i).Split()[j], move.Child(i).Split()[j],
-            1e-5);
-    }
-  }
-}
-
-/**
- * Test that the decision tree can be reasonably built.
- */
-BOOST_AUTO_TEST_CASE(DecisionTreeBuildTest)
-{
-  arma::mat inputData;
-  if (!data::Load("vc2.csv", inputData))
-    BOOST_FAIL("Cannot load test dataset vc2.csv!");
+  labelsIn << 0 << 0 << 0 << 0 << 1 << 1 << 0 << 0
+           << 1 << 1 << 1 << 2 << 1 << 2 << 2 << 2 << 2 << 2;
 
-  arma::Mat<size_t> labels;
-  if (!data::Load("vc2_labels.txt", labels))
-    BOOST_FAIL("Cannot load labels for vc2_labels.txt");
+  DecisionStump<arma::imat> ds(trainingData, labelsIn.row(0), 4, 3);
 
-  // Construct a full decision tree.
-  DecisionStump<arma::mat, false> tree(inputData, labels.row(0), 3);
+  imat testingData;
+  testingData << -6 << -6 << -2 << -1 << 3 << 5 << 7 << 9;
 
-  // Ensure that it has some children.
-  BOOST_REQUIRE_GT(tree.NumChildren(), 0);
+  arma::Row<size_t> predictedLabels;
+  ds.Classify(testingData, predictedLabels);
 
-  // Ensure that its children have some children.
-  for (size_t i = 0; i < tree.NumChildren(); ++i)
-    BOOST_REQUIRE_GT(tree.Child(i).NumChildren(), 0);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 0), 0);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 1), 0);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 2), 1);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 3), 1);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 4), 1);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 5), 1);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 6), 2);
+  BOOST_CHECK_EQUAL(predictedLabels(0, 7), 2);
 }
 
 BOOST_AUTO_TEST_SUITE_END();
diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp
index 2996f23..fd9925b 100644
--- a/src/mlpack/tests/serialization_test.cpp
+++ b/src/mlpack/tests/serialization_test.cpp
@@ -1274,12 +1274,8 @@ BOOST_AUTO_TEST_CASE(DecisionStumpTest)
   BOOST_REQUIRE_EQUAL(ds.SplitDimension(), binaryDs.SplitDimension());
 
   CheckMatrices(ds.Split(), xmlDs.Split(), textDs.Split(), binaryDs.Split());
-  for (size_t i = 0; i < ds.Split().n_elem; ++i)
-  {
-    BOOST_REQUIRE_EQUAL(ds.Child(i).Label(), xmlDs.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(ds.Child(i).Label(), textDs.Child(i).Label());
-    BOOST_REQUIRE_EQUAL(ds.Child(i).Label(), binaryDs.Child(i).Label());
-  }
+  CheckMatrices(ds.BinLabels(), xmlDs.BinLabels(), textDs.BinLabels(),
+      binaryDs.BinLabels());
 }
 
 // Make sure serialization works for LARS.

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list