[mlpack] 121/324: New test added. Improved entropy calculation.
Barak A. Pearlmutter
barak+git at cs.nuim.ie
Sun Aug 17 08:22:02 UTC 2014
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch svn-trunk
in repository mlpack.
commit 25bb2e692714985a0a2af2c255be1804d2324a71
Author: saxena.udit <saxena.udit at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date: Thu Jul 3 18:39:04 2014 +0000
New test added. Improved entropy calculation.
git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@16759 9d5b8971-822b-0410-80eb-d18c1038ef23
---
.../methods/decision_stump/decision_stump.hpp | 6 +-
.../methods/decision_stump/decision_stump_impl.hpp | 64 ++++++++--------------
src/mlpack/tests/decision_stump_test.cpp | 32 ++++++++++-
3 files changed, 58 insertions(+), 44 deletions(-)
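In short, the entropy calculation no longer looks at the attribute values at all: CalculateEntropy() now takes only the label subvector of a bucket, counts how often each class occurs, and sums p * log2(p); SetupSplitAttribute() then weights each bucket's term by the fraction of elements that fall in that bucket (the new ratioEl variable). A small standalone sketch of the resulting computation follows the decision_stump_impl.hpp diff below.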
diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 689d7d3..3f90729 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -45,12 +45,13 @@ class DecisionStump
*/
void Classify(const MatType& test, arma::Row<size_t>& predictedLabels);
+ int splitCol;
private:
//! Stores the number of classes.
size_t numClass;
//! Stores the value of the attribute on which to split.
- int splitCol;
+ // int splitCol;
//! Size of bucket while determining splitting criterion.
size_t bucketSize;
@@ -109,8 +110,7 @@ class DecisionStump
* @param labels Corresponding labels of the attribute.
*/
template <typename AttType, typename LabelType>
- double CalculateEntropy(arma::subview_row<AttType> attribute,
- arma::subview_row<LabelType> labels);
+ double CalculateEntropy(arma::subview_row<LabelType> labels);
};
}; // namespace decision_stump
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index b9c58a5..bdf531c 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -147,21 +147,23 @@ double DecisionStump<MatType>::SetupSplitAttribute(
i = 0;
count = 0;
-
+ double ratioEl;
// This splits the sorted into buckets of size greater than or equal to
// inpBucketSize.
while (i < sortedLabels.n_elem)
{
count++;
- if (i == sortedLabels.n_elem - 1)
+ if (i == sortedLabels.n_elem - 1)
{
// if we're at the end, then don't worry about the bucket size
// just take this as the last bin.
begin = i - count + 1;
end = i;
-
- entropy += CalculateEntropy<double, size_t>(
- sortedAtt.subvec(begin,end),sortedLabels.subvec(begin,end));
+
+ // Use ratioEl to hold the ratio of elements in this split.
+ ratioEl = ((double)(end - begin + 1)/sortedLabels.n_elem);
+
+ entropy += ratioEl * CalculateEntropy<size_t>(sortedLabels.subvec(begin,end));
i++;
}
else if (sortedLabels(i) != sortedLabels(i + 1))
@@ -171,6 +173,8 @@ double DecisionStump<MatType>::SetupSplitAttribute(
if (count < bucketSize)
{
// if it is, then take the minimum bucket size anyways
+ // This is where inpBucketSize comes into use; it makes sure there
+ // isn't a bucket for every change in labels.
begin = i - count + 1;
end = begin + bucketSize - 1;
@@ -183,9 +187,9 @@ double DecisionStump<MatType>::SetupSplitAttribute(
begin = i - count + 1;
end = i;
}
-
- entropy += CalculateEntropy<double, size_t>(
- sortedAtt.subvec(begin,end),sortedLabels.subvec(begin,end));
+ ratioEl = ((double)(end - begin + 1)/sortedLabels.n_elem);
+
+ entropy += ratioEl * CalculateEntropy<size_t>(sortedLabels.subvec(begin,end));
i = end + 1;
count = 0;
@@ -269,7 +273,7 @@ void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
// Find the most frequent element in subCols so as to assign a label to
// the bucket of subCols.
- mostFreq = CountMostFreq<double>(subCols);//sortedLabels.subvec(begin, end));
+ mostFreq = CountMostFreq<double>(subCols);
split.resize(split.n_elem + 1);
split(split.n_elem - 1) = sortedSplitAtt(begin);
@@ -372,45 +376,25 @@ int DecisionStump<MatType>::isDistinct(const arma::Row<rType>& featureRow)
*/
template<typename MatType>
template<typename AttType, typename LabelType>
-double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<AttType> attribute,
- arma::subview_row<LabelType> labels)
+double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<LabelType> labels)
{
double entropy = 0.0;
-
- arma::rowvec uniqueAtt = arma::unique(attribute);
- arma::Row<LabelType> uniqueLabel = arma::unique(labels);
- arma::Row<size_t> numElem(uniqueAtt.n_elem);
+ size_t j;
+
+ arma::Row<size_t> numElem(numClass);
numElem.fill(0);
- arma::Mat<size_t> entropyArray(uniqueAtt.n_elem,numClass);
- entropyArray.fill(0);
- // Populate entropyArray and numElem; they are used as helpers to calculate
+ // Populate numElem; it is used as a helper to calculate
// entropy.
- for (int j = 0; j < uniqueAtt.n_elem; j++)
- {
- for (int i = 0; i < attribute.n_elem; i++)
- {
- if (uniqueAtt[j] == attribute[i])
- {
- entropyArray(j, labels(i))++;
- numElem(j)++;
- }
- }
- }
+ for (j = 0; j < labels.n_elem; j++)
+ numElem(labels(j))++;
- for (int j = 0; j < uniqueAtt.size(); j++)
+ for (j = 0; j < numClass; j++)
{
- const double p1 = ((double) numElem(j) / attribute.n_elem);
-
- for (int i = 0; i < numClass; i++)
- {
- const double p2 = ((double) entropyArray(j, i) / numElem(j));
- const double p3 = (p2 == 0) ? 0 : p2 * log2(p2);
-
- entropy += p1 * p3;
- }
+ const double p1 = ((double) numElem(j) / labels.n_elem);
+
+ entropy += (p1 == 0) ? 0 : p1 * log2(p1);
}
-
return entropy;
}
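A minimal standalone sketch of the computation above, with an illustrative function name and toy data rather than the mlpack API: count each class in a bucket's labels, sum p * log2(p) over the classes, and weight each bucket's term by its share of the sorted labels, as ratioEl does in SetupSplitAttribute().

#include <armadillo>
#include <cmath>
#include <iostream>

// Per-bucket term: count each class in the bucket's labels, then sum
// p * log2(p). The sum is left unnegated, matching the sign convention of
// CalculateEntropy() above.
double BucketEntropyTerm(const arma::Row<size_t>& labels, const size_t numClass)
{
  arma::Row<size_t> numElem(numClass);
  numElem.fill(0);
  for (size_t j = 0; j < labels.n_elem; j++)
    numElem(labels(j))++;

  double entropy = 0.0;
  for (size_t j = 0; j < numClass; j++)
  {
    const double p = (double) numElem(j) / labels.n_elem;
    entropy += (p == 0) ? 0 : p * std::log2(p);
  }
  return entropy;
}

int main()
{
  // A sorted label row split into two buckets: {0,0,1,1,1} and {1,1,1}.
  arma::Row<size_t> sortedLabels("0 0 1 1 1 1 1 1");

  // Each bucket's term is weighted by its share of all labels, as done with
  // ratioEl in SetupSplitAttribute().
  double splitEntropy = 0.0;
  splitEntropy += (5.0 / 8.0) * BucketEntropyTerm(sortedLabels.subvec(0, 4), 2);
  splitEntropy += (3.0 / 8.0) * BucketEntropyTerm(sortedLabels.subvec(5, 7), 2);

  // First bucket: 0.4 * log2(0.4) + 0.6 * log2(0.6) ~= -0.971; the second
  // bucket is pure, so it contributes 0. Weighted total ~= -0.607.
  std::cout << splitEntropy << std::endl;
  return 0;
}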
diff --git a/src/mlpack/tests/decision_stump_test.cpp b/src/mlpack/tests/decision_stump_test.cpp
index efb55d5..04fbf41 100644
--- a/src/mlpack/tests/decision_stump_test.cpp
+++ b/src/mlpack/tests/decision_stump_test.cpp
@@ -6,7 +6,7 @@
*/
#include <mlpack/core.hpp>
#include <mlpack/methods/decision_stump/decision_stump.hpp>
-
+
#include <boost/test/unit_test.hpp>
#include "old_boost_test_definitions.hpp"
@@ -47,6 +47,36 @@ BOOST_AUTO_TEST_CASE(OneClass)
BOOST_CHECK_EQUAL(predictedLabels(i), 1);
}
+/**
+ * This tests whether the entropy is being calculated correctly, by
+ * checking that the correct splitting column is chosen.
+ *
+ * For an inpBucketSize of 4, the correct value of splitCol is 1.
+ */
+BOOST_AUTO_TEST_CASE(CorrectAttributeChosen)
+{
+ const size_t numClasses = 2;
+ const size_t inpBucketSize = 4;
+
+ mat trainingData;
+ trainingData << 0 << 0 << 0 << 0 << 0 << 1 << 1 << 1 << 1
+ << 2 << 2 << 2 << 2 << 2 << endr
+ << 70 << 90 << 85 << 95 << 70 << 90 << 78 << 65 << 75
+ << 80 << 70 << 80 << 80 << 96 << endr
+ << 1 << 1 << 0 << 0 << 0 << 1 << 0 << 1 << 0
+ << 1 << 1 << 0 << 0 << 0 << endr;
+
+ // No need to normalize labels here.
+ Mat<size_t> labelsIn;
+ labelsIn << 0 << 1 << 1 << 1 << 0 << 0 << 0 << 0
+ << 0 << 1 << 1 << 0 << 0 << 0;
+
+ DecisionStump<> ds(trainingData, labelsIn.row(0), numClasses, inpBucketSize);
+
+ // Only the value of the splitting column needs to be checked; no classification is required.
+
+ BOOST_CHECK_EQUAL(ds.splitCol, 1);
+}
/**
* This tests for the classification:
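As a follow-up to the new test (not part of this commit), here is a hedged sketch of how the trained stump could also be exercised through the public Classify() method declared in decision_stump.hpp; the test points are made up for illustration:

#include <mlpack/core.hpp>
#include <mlpack/methods/decision_stump/decision_stump.hpp>

using namespace arma;
using namespace mlpack::decision_stump;

// Classify two hypothetical 3-dimensional points with an already-trained
// stump, such as the ds object built in CorrectAttributeChosen above.
void ClassifySketch(DecisionStump<>& ds)
{
  mat testData;
  testData << 0 << 2 << endr
           << 71 << 95 << endr
           << 1 << 0 << endr;

  Row<size_t> predictedLabels;
  ds.Classify(testData, predictedLabels);
  // predictedLabels now holds one predicted class per column of testData.
}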