[mlpack] 86/324: Remove oneClass and defaultClass variables. There is a shortcut that can be taken when all the labels are the same, but the Entropy() function does not appear to be working correctly.

Barak A. Pearlmutter barak+git at cs.nuim.ie
Sun Aug 17 08:21:58 UTC 2014


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch svn-trunk
in repository mlpack.

commit fbbff229a07405f57e7b5f3dd3cb2fa2cbe00dc1
Author: rcurtin <rcurtin at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date:   Thu Jun 26 23:21:07 2014 +0000

    Remove oneClass and defaultClass variables.  There is a shortcut that can be taken when all the labels are the same, but the Entropy() function does not appear to be working correctly.
    
    
    git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@16724 9d5b8971-822b-0410-80eb-d18c1038ef23
---
 .../methods/decision_stump/decision_stump.hpp      |   6 --
 .../methods/decision_stump/decision_stump_impl.hpp | 113 ++++++++++-----------
 2 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 9b16a39..fb7515d 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -49,15 +49,9 @@ class DecisionStump
   //! Stores the number of classes.
   size_t numClass;
 
-  //! Stores the default class. Provided for handling missing attribute values.
-  size_t defaultClass;
-
   //! Stores the value of the attribute on which to split.
   int splitCol;
 
-  //! Flag value for distinct input class labels.
-  bool oneClass;
-
   //! Size of bucket while determining splitting criterion.
   size_t bucketSize;
 
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index 5f6bf8c..625e12e 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -34,48 +34,55 @@ DecisionStump<MatType>::DecisionStump(const MatType& data,
   numClass = classes;
   bucketSize = inpBucketSize;
 
-  // Check whether the input labels are not all identical.
-  if (!isDistinct<size_t>(labels))
-  {
-    // If the labels are all identical, the default class is the only class.
-    oneClass = true;
-    defaultClass = labels(0);
-  }
-  else
-  {
-    // If labels are not all identical, proceed with training.
-    oneClass = false;
-    int bestAtt = -1;
-    double entropy;
-    double bestEntropy = DBL_MAX;
+  // If classLabels are not all identical, proceed with training.
+  int bestAtt = -1;
+  double entropy;
+  double bestEntropy = DBL_MAX;
 
-    // Set the default class to handle attribute values which are not present in
-    // the training data.
-    defaultClass = CountMostFreq<size_t>(labels);
+  // Set the default class to handle attribute values which are not present in
+  // the training data.
+  //defaultClass = CountMostFreq<size_t>(classLabels);
 
-    for (int i = 0; i < data.n_rows; i++)
+  for (int i = 0; i < data.n_rows; i++)
+  {
+    // Go through each attribute of the data.
+    if (isDistinct<double>(data.row(i)))
     {
-      // Go through each attribute of the data.
-      if (isDistinct<double>(data.row(i)))
+      // For each attribute with non-identical values, treat it as a potential
+      // splitting attribute and calculate entropy if split on it.
+      entropy = SetupSplitAttribute(data.row(i), labels);
+
+      // Find the attribute with the bestEntropy so that the gain is
+      // maximized.
+      if (entropy < bestEntropy)
       {
-        // For each attribute with non-identical values, treat it as a potential
-        // splitting attribute and calculate entropy if split on it.
-        entropy = SetupSplitAttribute(data.row(i), labels);
-
-        // Find the attribute with the bestEntropy so that the gain is
-        // maximized.
-        if (entropy < bestEntropy)
-        {
-          bestAtt = i;
-          bestEntropy = entropy;
-        }
+        bestAtt = i;
+        bestEntropy = entropy;
       }
-    }
-    splitCol = bestAtt;
 
-    // Once the splitting column/attribute has been decided, train on it.
-    TrainOnAtt<double>(data.row(splitCol), labels);
+      /* This section is commented out because I believe entropy calculation is
+       * wrong.  Entropy should only be 0 if there is only one class, in which
+       * case classification is perfect and we can take the shortcut below.
+
+      // If the entropy is 0, then all the labels are the same and we are done.
+      Log::Debug << "Entropy is " << entropy << "\n";
+      if (entropy == 0)
+      {
+        // Only one split element... there is no split at all, just one bin.
+        split.set_size(1);
+        binLabels.set_size(1);
+        split[0] = -DBL_MAX;
+        binLabels[0] = labels[0];
+        splitCol = 0; // It doesn't matter.
+        return;
+      }
+      */
+    }
   }
+  splitCol = bestAtt;
+
+  // Once the splitting column/attribute has been decided, train on it.
+  TrainOnAtt<double>(data.row(splitCol), labels);
 }
 
 /**
@@ -90,31 +97,23 @@ template<typename MatType>
 void DecisionStump<MatType>::Classify(const MatType& test,
                                       arma::Row<size_t>& predictedLabels)
 {
-  if (!oneClass)
+  for (int i = 0; i < test.n_cols; i++)
   {
-    for (int i = 0; i < test.n_cols; i++)
-    {
-      // Determine which bin the test point falls into.
-      // Assume first that it falls into the first bin, then proceed through the
-      // bins until it is known which bin it falls into.
-      int bin = 0;
-      const double val = test(splitCol, i);
+    // Determine which bin the test point falls into.
+    // Assume first that it falls into the first bin, then proceed through the
+    // bins until it is known which bin it falls into.
+    int bin = 0;
+    const double val = test(splitCol, i);
 
-      while (bin < split.n_elem - 1)
-      {
-        if (val < split(bin + 1))
-          break;
-
-        ++bin;
-      }
+    while (bin < split.n_elem - 1)
+    {
+      if (val < split(bin + 1))
+        break;
 
-      predictedLabels(i) = binLabels(bin);
+      ++bin;
     }
-  }
-  else
-  {
-    for (int i = 0; i < test.n_cols; i++)
-      predictedLabels(i) = defaultClass;
+
+    predictedLabels(i) = binLabels(bin);
   }
 }
 
@@ -408,7 +407,7 @@ double DecisionStump<MatType>::CalculateEntropy(const arma::rowvec& attribute,
     {
       if (uniqueAtt[j] == attribute[i])
       {
-        entropyArray(j,labels(i))++;
+        entropyArray(j, labels(i))++;
         numElem(j)++;
       }
     }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list