[mlpack] 89/324: Adding new cosine_tree code.

Barak A. Pearlmutter barak+git at cs.nuim.ie
Sun Aug 17 08:21:59 UTC 2014


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch svn-trunk
in repository mlpack.

commit fee79566328de0cb6b7ac1fc40bc7e98fe5890f2
Author: siddharth.950 <siddharth.950 at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date:   Fri Jun 27 08:28:07 2014 +0000

    Adding new cosine_tree code.
    
    git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@16727 9d5b8971-822b-0410-80eb-d18c1038ef23
---
 CMakeLists.txt                                     |  24 +--
 src/mlpack/core/tree/CMakeLists.txt                |   4 +-
 src/mlpack/core/tree/cosine_tree/cosine_node.hpp   | 178 ++++++++++++++++
 .../core/tree/cosine_tree/cosine_node_impl.hpp     | 230 +++++++++++++++++++++
 src/mlpack/core/tree/cosine_tree/cosine_tree.hpp   | 103 +++++++++
 .../core/tree/cosine_tree/cosine_tree_impl.hpp     | 222 ++++++++++++++++++++
 src/mlpack/tests/CMakeLists.txt                    |   1 +
 src/mlpack/tests/CMakeLists.txt~                   |   2 +
 src/mlpack/tests/cosine_tree_test.cpp              | 186 +++++++++++++++++
 src/mlpack/tests/tree_test.cpp                     | 194 -----------------
 10 files changed, 926 insertions(+), 218 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5895e7e..fe606d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,36 +170,16 @@ include_directories(${LIBXML2_INCLUDE_DIR})
 # Unfortunately this configuration variable is necessary and will need to be
 # updated as time goes on and new versions are released.
 set(Boost_ADDITIONAL_VERSIONS
-  "1.41" "1.41.0" "1.42" "1.42.0" "1.43" "1.43.0" "1.44" "1.44.0" "1.45.0"
-  "1.46.0" "1.46.1" "1.47.0" "1.48.0" "1.49.0")
+  "1.49.0" "1.50.0" "1.51.0" "1.52.0" "1.53.0" "1.54.0" "1.55.0")
 find_package(Boost
     COMPONENTS
       program_options
       unit_test_framework
+      random
     REQUIRED
 )
 include_directories(${Boost_INCLUDE_DIRS})
 
-# Save the actual link paths (because they will get overwritten if we discover
-# we need to find Boost.Random too).
-set(Boost_BACKUP_LIBRARIES ${Boost_LIBRARIES})
-
-# We need to include Boost.Random, but only if newer than 1.45 (as of 1.46 it
-# became a separate package with its own linkable library object).
-if(Boost_MAJOR_VERSION EQUAL 1 AND Boost_MINOR_VERSION GREATER 45)
-  find_package(Boost
-      COMPONENTS
-          random
-      REQUIRED
-  )
-
-  # Restore actual link locations of the other Boost libraries.
-  set(Boost_LIBRARIES ${Boost_LIBRARIES} ${Boost_BACKUP_LIBRARIES})
-
-  # This may be redundant.
-  include_directories(${Boost_INCLUDE_DIRS})
-
-endif(Boost_MAJOR_VERSION EQUAL 1 AND Boost_MINOR_VERSION GREATER 45)
 link_directories(${Boost_LIBRARY_DIRS})
 
 # On Windows, automatic linking is performed, so we don't need to worry about
diff --git a/src/mlpack/core/tree/CMakeLists.txt b/src/mlpack/core/tree/CMakeLists.txt
index 2f61c29..5003336 100644
--- a/src/mlpack/core/tree/CMakeLists.txt
+++ b/src/mlpack/core/tree/CMakeLists.txt
@@ -13,10 +13,10 @@ set(SOURCES
   binary_space_tree/single_tree_traverser_impl.hpp
   binary_space_tree/traits.hpp
   bounds.hpp
+  cosine_tree/cosine_node.hpp
+  cosine_tree/cosine_node_impl.hpp
   cosine_tree/cosine_tree_impl.hpp
   cosine_tree/cosine_tree.hpp
-  cosine_tree/cosine_tree_builder.hpp
-  cosine_tree/cosine_tree_builder_impl.hpp
   cover_tree/cover_tree.hpp
   cover_tree/cover_tree_impl.hpp
   cover_tree/first_point_is_root.hpp
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_node.hpp b/src/mlpack/core/tree/cosine_tree/cosine_node.hpp
new file mode 100644
index 0000000..2f69dbe
--- /dev/null
+++ b/src/mlpack/core/tree/cosine_tree/cosine_node.hpp
@@ -0,0 +1,178 @@
+/**
+ * @file cosine_node.hpp
+ * @author Siddharth Agrawal
+ *
+ * Definition of Cosine Node.
+ */
+ 
+#ifndef __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_HPP
+#define __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree {
+
+class CosineNode
+{
+ public:
+ 
+  /**
+   * CosineNode constructor for the root node of the tree. It initializes the
+   * necessary variables required for splitting of the node, and building the
+   * tree further. It takes a pointer to the input matrix and calculates the
+   * relevant variables using it.
+   *
+   * @param dataset Matrix for which cosine tree is constructed.
+   */
+  CosineNode(const arma::mat& dataset);
+  
+  /**
+   * CosineNode constructor for nodes other than the root node of the tree. It
+   * takes in a pointer to the parent node and a list of column indices which
+   * mentions the columns to be included in the node. The function calculate the
+   * relevant variables just like the constructor above.
+   *
+   * @param parentNode Pointer to the parent CosineNode.
+   * @param subIndices Pointer to vector of column indices to be included.
+   */
+  CosineNode(CosineNode& parentNode, const std::vector<size_t>& subIndices);
+  
+  /**
+   * This function splits the CosineNode into two children based on the cosines
+   * of the columns contained in the node, with respect to the sampled splitting
+   * point. The function also calls the CosineNode constructor for the children.
+   */
+  void CosineNodeSplit();
+  
+  /**
+   * Sample 'numSamples' points from the Length-Squared distribution of the
+   * CosineNode. The function uses 'l2NormsSquared' to calculate the cumulative
+   * probability distribution of the column vectors. The sampling is based on a
+   * randomly generated values in the range [0, 1].
+   */
+  void ColumnSamplesLS(std::vector<size_t>& sampledIndices, 
+                       arma::vec& probabilities, size_t numSamples);
+  
+  /**
+   * Sample a point from the Length-Squared distribution of the CosineNode. The
+   * function uses 'l2NormsSquared' to calculate the cumulative probability
+   * distribution of the column vectors. The sampling is based on a randomly
+   * generated value in the range [0, 1].
+   */
+  size_t ColumnSampleLS();
+  
+  /**
+   * Sample a column based on the cumulative Length-Squared distribution of the
+   * CosineNode, and a randomly generated value in the range [0, 1]. Binary
+   * search is more efficient than searching linearly for the same. This leads
+   * a significant speedup when there are large number of columns to choose from
+   * and when a number of samples are to be drawn from the distribution.
+   *
+   * @param cDistribution Cumulative LS distribution of columns in the node.
+   * @param value Randomly generated value in the range [0, 1].
+   * @param start Starting index of the distribution interval to search in.
+   * @param end Ending index of the distribution interval to search in.
+   */
+  size_t BinarySearch(arma::vec& cDistribution, double value, size_t start,
+                      size_t end);
+  
+  /**
+   * Calculate cosines of the columns present in the node, with respect to the
+   * sampled splitting point. The calculated cosine values are useful for
+   * splitting the node into its children.
+   *
+   * @param cosines Vector to store the cosine values in.
+   */
+  void CalculateCosines(arma::vec& cosines);
+  
+  /**
+   * Calculate centroid of the columns present in the node. The calculated
+   * centroid is used as a basis vector for the cosine tree being constructed.
+   */
+  void CalculateCentroid();
+  
+  //! Get pointer to the dataset matrix.
+  const arma::mat& GetDataset() const { return dataset; }
+  
+  //! Get the indices of columns in the node.
+  std::vector<size_t>& VectorIndices() { return indices; }
+  
+  //! Set the Monte Carlo error.
+  void L2Error(const double error) { this->l2Error = error; }
+  
+  //! Get the Monte Carlo error.
+  double L2Error() const { return l2Error; }
+  
+  //! Get pointer to the centroid vector.
+  arma::vec& Centroid() { return centroid; }
+  
+  //! Set the basis vector of the node.
+  void BasisVector(arma::vec& bVector) { this->basisVector = bVector; }
+  
+  //! Get the basis vector of the node.
+  arma::vec& BasisVector() { return basisVector; }
+  
+  //! Get pointer to the left child of the node.
+  CosineNode* Left() { return left; }
+  
+  //! Get pointer to the right child of the node.
+  CosineNode* Right() { return right; }
+  
+  //! Get number of columns of input matrix in the node.
+  size_t NumColumns() const { return numColumns; }
+  
+  //! Get the Frobenius norm squared of columns in the node.
+  double FrobNormSquared() const { return frobNormSquared; }
+  
+  //! Get the column index of split point of the node.
+  size_t SplitPointIndex() const { return indices[splitPointIndex]; }
+ 
+ private:
+  //! Matrix for which cosine tree is constructed.
+  const arma::mat& dataset;
+  //! Parent of the node.
+  CosineNode* parent;
+  //! Right child of the node.
+  CosineNode* right;
+  //! Left child of the node.
+  CosineNode* left;
+  //! Indices of columns of input matrix in the node.
+  std::vector<size_t> indices;
+  //! L2-norm squared of columns in the node.
+  arma::vec l2NormsSquared;
+  //! Centroid of columns of input matrix in the node.
+  arma::vec centroid;
+  //! Orthonormalized basis vector of the node.
+  arma::vec basisVector;
+  //! Index of split point of cosine node.
+  size_t splitPointIndex;
+  //! Number of columns of input matrix in the node.
+  size_t numColumns;
+  //! Monte Carlo error for this node.
+  double l2Error;
+  //! Frobenius norm squared of columns in the node.
+  double frobNormSquared;
+  
+  // Friend class to facilitate construction of priority queue.
+  friend class CompareCosineNode;
+};
+
+class CompareCosineNode
+{
+ public:
+ 
+  // Comparison function for construction of priority queue.
+  bool operator() (const CosineNode* a, const CosineNode* b) const
+  {
+    return a->l2Error < b->l2Error;
+  }
+};
+
+}; // namespace tree
+}; // namespace mlpack
+
+// Include implementation.
+#include "cosine_node_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp b/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp
new file mode 100644
index 0000000..fb92665
--- /dev/null
+++ b/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp
@@ -0,0 +1,230 @@
+/**
+ * @file cosine_node_impl.hpp
+ * @author Siddharth Agrawal
+ *
+ * Implementation of cosine node.
+ */
+#ifndef __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_IMPL_HPP
+#define __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_IMPL_HPP
+
+// In case it wasn't included already for some reason.
+#include "cosine_node.hpp"
+
+namespace mlpack {
+namespace tree {
+
+CosineNode::CosineNode(const arma::mat& dataset) :
+    dataset(dataset),
+    parent(NULL),
+    right(NULL),
+    left(NULL),
+    numColumns(dataset.n_cols)
+{  
+  // Initialize sizes of column indices and l2 norms.
+  indices.resize(numColumns);
+  l2NormsSquared.zeros(numColumns);
+  
+  // Set indices and calculate squared norms of the columns.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    indices[i] = i;
+    double l2Norm = arma::norm(dataset.col(i), 2);
+    l2NormsSquared(i) = l2Norm * l2Norm;
+  }
+  
+  // Frobenius norm of columns in the node.
+  frobNormSquared = arma::accu(l2NormsSquared);
+  
+  // Calculate centroid of columns in the node.
+  CalculateCentroid();
+  
+  splitPointIndex = ColumnSampleLS();
+}
+
+CosineNode::CosineNode(CosineNode& parentNode,
+                       const std::vector<size_t>& subIndices) :
+    dataset(parentNode.GetDataset()),
+    parent(&parentNode),
+    right(NULL),
+    left(NULL),
+    numColumns(subIndices.size())
+{
+  // Initialize sizes of column indices and l2 norms.
+  indices.resize(numColumns);
+  l2NormsSquared.zeros(numColumns);
+  
+  // Set indices and squared norms of the columns.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    indices[i] = parentNode.indices[subIndices[i]];
+    l2NormsSquared(i) = parentNode.l2NormsSquared(subIndices[i]);
+  }
+  
+  // Frobenius norm of columns in the node.
+  frobNormSquared = arma::accu(l2NormsSquared);
+  
+  // Calculate centroid of columns in the node.
+  CalculateCentroid();
+  
+  splitPointIndex = ColumnSampleLS();
+}
+
+void CosineNode::CosineNodeSplit()
+{
+  // Nodes with fewer than three columns are not split; left/right are untouched.
+  if(numColumns < 3) return;
+  
+  // Cosine of every column in the node with respect to the splitting point.
+  arma::vec cosines;
+  CalculateCosines(cosines);
+  
+  // Max over cosines with entries >= 1 masked to zero, and the plain minimum.
+  double cosineMax, cosineMin;
+  cosineMax = arma::max(cosines % (cosines < 1));
+  cosineMin = arma::min(cosines);
+  
+  std::vector<size_t> leftIndices, rightIndices;
+  
+  // Partition the columns: column i goes to the left child when its cosine is
+  // at least as close to the maximum as to the minimum, that is, when
+  //     cos_max - cos(i) <= cos(i) - cos_min
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    if(cosineMax - cosines(i) <= cosines(i) - cosineMin)
+    {
+      leftIndices.push_back(i);
+    }
+    else
+    {
+      rightIndices.push_back(i);
+    }
+  }
+  
+  // Build both children; the index lists hold positions within this node.
+  left = new CosineNode(*this, leftIndices);
+  right = new CosineNode(*this, rightIndices);
+}
+
+void CosineNode::ColumnSamplesLS(std::vector<size_t>& sampledIndices,
+                                 arma::vec& probabilities,
+                                 size_t numSamples)
+{
+  // Initialize the cumulative distribution vector size.
+  arma::vec cDistribution;
+  cDistribution.zeros(numColumns + 1);
+  
+  // Calculate cumulative length-squared distribution for the node.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
+  }
+  
+  // Initialize sizes of the 'sampledIndices' and 'probabilities' vectors.
+  sampledIndices.resize(numSamples);
+  probabilities.zeros(numSamples);
+  
+  for(size_t i = 0; i < numSamples; i++)
+  {
+    // Generate a random value for sampling.
+    double randValue = arma::randu();
+    size_t start = 0, end = numColumns, searchIndex;
+    
+    // Sample from the distribution and store corresponding probability.
+    searchIndex = BinarySearch(cDistribution, randValue, start, end);
+    sampledIndices[i] = indices[searchIndex];
+    probabilities(i) = l2NormsSquared(searchIndex) / frobNormSquared;
+  }
+}
+
+size_t CosineNode::ColumnSampleLS()
+{
+  // If only one element is present, there can only be one sample.
+  if(numColumns < 2)
+  {
+    return 0;
+  }
+
+  // Initialize the cumulative distribution vector size.
+  arma::vec cDistribution;
+  cDistribution.zeros(numColumns + 1);
+  
+  // Calculate cumulative length-squared distribution for the node.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
+  }
+  
+  // Generate a random value for sampling.
+  double randValue = arma::randu();
+  size_t start = 0, end = numColumns;
+  
+  // Sample from the distribution.
+  return BinarySearch(cDistribution, randValue, start, end);
+}
+
+size_t CosineNode::BinarySearch(arma::vec& cDistribution,
+                                double value,
+                                size_t start,
+                                size_t end)
+{
+  // Returns the index i in [start, end) of the interval of the cumulative
+  // distribution that contains 'value', i.e. the first i for which
+  // value <= cDistribution(i + 1). Values outside the covered range (for
+  // example a 'value' slightly above a total that rounded below 1, or exactly
+  // 0) are clamped to the last or first interval instead of recursing forever.
+  if(end <= start)
+  {
+    return start;
+  }
+
+  // Invariant: the upper edge of the wanted interval lies in (lower, upper].
+  size_t lower = start, upper = end;
+  while(upper - lower > 1)
+  {
+    const size_t pivot = lower + (upper - lower) / 2;
+    if(value <= cDistribution(pivot))
+      upper = pivot;
+    else
+      lower = pivot;
+  }
+  return upper - 1;
+}
+
+void CosineNode::CalculateCosines(arma::vec& cosines)
+{
+  // Initialize cosine vector as a vector of zeros.
+  cosines.zeros(numColumns);
+  
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    // If norm is zero, store cosine value as zero. Else, calculate cosine value
+    // between two vectors.
+    if(l2NormsSquared(i) == 0)
+    {
+      cosines(i) = 0;
+    }
+    else
+    {
+      cosines(i) = arma::norm_dot(dataset.col(indices[splitPointIndex]),
+                                  dataset.col(indices[i]));
+    }
+  }
+}
+
+void CosineNode::CalculateCentroid()
+{
+  // Start from the zero vector with one entry per dataset row.
+  centroid.zeros(dataset.n_rows);
+
+  // Sum the columns owned by this node, then average them. An empty node keeps
+  // the zero vector: dividing by numColumns == 0 would fill it with NaNs.
+  for(size_t i = 0; i < numColumns; i++)
+    centroid += dataset.col(indices[i]);
+  if(numColumns > 0)
+    centroid /= numColumns;
+}
+
+}; // namespace tree
+}; // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp b/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
new file mode 100644
index 0000000..e073381
--- /dev/null
+++ b/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
@@ -0,0 +1,103 @@
+/**
+ * @file cosine_tree.hpp
+ * @author Siddharth Agrawal
+ *
+ * Definition of Cosine Tree.
+ */
+ 
+#ifndef __MLPACK_CORE_TREE_COSINE_TREE_COSINE_TREE_HPP
+#define __MLPACK_CORE_TREE_COSINE_TREE_COSINE_TREE_HPP
+
+#include <mlpack/core.hpp>
+#include <boost/heap/priority_queue.hpp>
+
+#include "cosine_node.hpp"
+
+namespace mlpack {
+namespace tree {
+
+class CosineTree
+{
+ public:
+ 
+  // Type definition for CosineNode priority queue.
+  typedef boost::heap::priority_queue<CosineNode*,
+      boost::heap::compare<CompareCosineNode> > CosineNodeQueue;
+ 
+  /**
+   * Construct the CosineTree and the basis for the given matrix, and passed
+   * 'epsilon' and 'delta' parameters. The CosineTree is constructed by
+   * splitting nodes in the direction of maximum error, stored using a priority
+   * queue. Basis vectors are added from the left and right children of the
+   * split node. The basis vector from a node is the orthonormalized centroid of
+   * its columns. The splitting continues till the Monte Carlo estimate of the
+   * input matrix's projection on the obtained subspace is less than a fraction
+   * of the norm of the input matrix.
+   *
+   * @param dataset Matrix for which the CosineTree is constructed.
+   * @param epsilon Error tolerance fraction for calculated subspace.
+   * @param delta Cumulative probability for Monte Carlo error lower bound.
+   */
+  CosineTree(const arma::mat& dataset,
+             const double epsilon,
+             const double delta);
+  
+  /**
+   * Calculates the orthonormalization of the passed centroid, with respect to
+   * the current vector subspace.
+   *
+   * @param treeQueue Priority queue of cosine nodes.
+   * @param centroid Centroid of the node being added to the basis.
+   * @param newBasisVector Orthonormalized centroid of the node.
+   * @param addBasisVector Address to additional basis vector.
+   */                           
+  void ModifiedGramSchmidt(CosineNodeQueue& treeQueue,
+                           arma::vec& centroid,
+                           arma::vec& newBasisVector,
+                           arma::vec* addBasisVector = NULL);
+  
+  /**
+   * Estimates the squared error of the projection of the input node's matrix
+   * onto the current vector subspace. A normal distribution is fit using
+   * weighted norms of projections of samples drawn from the input node's matrix
+   * columns. The error is calculated as the difference between the Frobenius
+   * norm of the input node's matrix and lower bound of the normal distribution.
+   *
+   * @param node Node for which Monte Carlo estimate is calculated.
+   * @param treeQueue Priority queue of cosine nodes.
+   * @param addBasisVector1 Address to first additional basis vector.
+   * @param addBasisVector2 Address to second additional basis vector.
+   */                         
+  double MonteCarloError(CosineNode* node,
+                         CosineNodeQueue& treeQueue,
+                         arma::vec* addBasisVector1 = NULL,
+                         arma::vec* addBasisVector2 = NULL);
+  
+  /**
+   * Constructs the final basis matrix, after the cosine tree construction.
+   *
+   * @param treeQueue Priority queue of cosine nodes.
+   */                       
+  void ConstructBasis(CosineNodeQueue& treeQueue);
+  
+  //! Returns the basis of the constructed subspace.
+  void GetFinalBasis(arma::mat& finalBasis) { finalBasis = basis; }
+  
+ private:
+  //! Matrix for which cosine tree is constructed.
+  const arma::mat& dataset;
+  //! Error tolerance fraction for calculated subspace.
+  double epsilon;
+  //! Cumulative probability for Monte Carlo error lower bound.
+  double delta;
+  //! Subspace basis of the input dataset.
+  arma::mat basis;
+};
+
+}; // namespace tree
+}; // namespace mlpack
+
+// Include implementation.
+#include "cosine_tree_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp b/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
new file mode 100644
index 0000000..263d0b1
--- /dev/null
+++ b/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
@@ -0,0 +1,222 @@
+/**
+ * @file cosine_tree_impl.hpp
+ * @author Siddharth Agrawal
+ *
+ * Implementation of cosine tree.
+ */
+#ifndef __MLPACK_CORE_TREE_COSINE_TREE_COSINE_TREE_IMPL_HPP
+#define __MLPACK_CORE_TREE_COSINE_TREE_COSINE_TREE_IMPL_HPP
+
+// In case it wasn't included already for some reason.
+#include "cosine_tree.hpp"
+
+#include <boost/math/distributions/normal.hpp>
+
+namespace mlpack {
+namespace tree {
+
+CosineTree::CosineTree(const arma::mat& dataset,
+                       const double epsilon,
+                       const double delta) :
+    dataset(dataset),
+    epsilon(epsilon),
+    delta(delta)
+{
+  // Priority queue of nodes; the root starts with a zero basis vector.
+  CosineNodeQueue treeQueue;
+  CosineNode root(dataset);
+  arma::vec tempVector = arma::zeros(dataset.n_rows);
+  root.L2Error(0);
+  root.BasisVector(tempVector);
+  treeQueue.push(&root);
+
+  // Initialize Monte Carlo error estimate for comparison.
+  double monteCarloError = root.FrobNormSquared();
+
+  while(monteCarloError > epsilon * root.FrobNormSquared())
+  {
+    // Pop node from queue with highest projection error.
+    CosineNode* currentNode = treeQueue.top();
+    treeQueue.pop();
+
+    // Split the node into left and right children.
+    currentNode->CosineNodeSplit();
+    CosineNode* currentLeft = currentNode->Left();
+    CosineNode* currentRight = currentNode->Right();
+
+    // CosineNodeSplit() gives a node with too few columns no children. Keep
+    // that node in the basis and stop, rather than dereference NULL below.
+    if(!currentLeft || !currentRight)
+    {
+      treeQueue.push(currentNode);
+      break;
+    }
+
+    // Calculate basis vectors of left and right children.
+    arma::vec lBasisVector, rBasisVector;
+    ModifiedGramSchmidt(treeQueue, currentLeft->Centroid(), lBasisVector);
+    ModifiedGramSchmidt(treeQueue, currentRight->Centroid(), rBasisVector,
+                        &lBasisVector);
+
+    // Add basis vectors to their respective nodes.
+    currentLeft->BasisVector(lBasisVector);
+    currentRight->BasisVector(rBasisVector);
+
+    // Calculate Monte Carlo error estimates for child nodes.
+    MonteCarloError(currentLeft, treeQueue, &lBasisVector, &rBasisVector);
+    MonteCarloError(currentRight, treeQueue, &lBasisVector, &rBasisVector);
+
+    // Push child nodes into the priority queue.
+    treeQueue.push(currentLeft);
+    treeQueue.push(currentRight);
+
+    // Re-estimate the root's error and print its ratio to the total norm.
+    monteCarloError = MonteCarloError(&root, treeQueue);
+    std::cout << monteCarloError / root.FrobNormSquared() << "\n";
+  }
+
+  // Construct the subspace basis from the current priority queue.
+  ConstructBasis(treeQueue);
+}
+
+void CosineTree::ModifiedGramSchmidt(CosineNodeQueue& treeQueue,
+                                     arma::vec& centroid,
+                                     arma::vec& newBasisVector,
+                                     arma::vec* addBasisVector)
+{
+  // Set new basis vector to centroid.
+  newBasisVector = centroid;
+
+  // Variables for iterating through the priority queue.
+  CosineNode *currentNode;
+  CosineNodeQueue::const_iterator i = treeQueue.begin();
+
+  // For every vector in the current basis, remove its projection from the
+  // centroid.
+  for(; i != treeQueue.end(); i++)
+  {
+    currentNode = *i;
+    
+    double projection = arma::dot(currentNode->BasisVector(), centroid);
+    newBasisVector -= projection * currentNode->BasisVector();
+  }
+  
+  // If additional basis vector is passed, take it into account.
+  if(addBasisVector)
+  {
+    double projection = arma::dot(*addBasisVector, centroid);
+    newBasisVector -= *addBasisVector * projection;
+  }
+  
+  // Normalize the modified centroid vector.
+  if(arma::norm(newBasisVector, 2))
+    newBasisVector /= arma::norm(newBasisVector, 2);
+}
+
+double CosineTree::MonteCarloError(CosineNode* node,
+                                   CosineNodeQueue& treeQueue,
+                                   arma::vec* addBasisVector1,
+                                   arma::vec* addBasisVector2)
+{
+  // An empty node has nothing to sample from and nothing left to explain, so
+  // its error is zero. This also keeps log(0) out of the sample count below.
+  if(node->NumColumns() == 0)
+  {
+    node->L2Error(0);
+    return 0;
+  }
+
+  std::vector<size_t> sampledIndices;
+  arma::vec probabilities;
+
+  // Sample O(log m) columns, where 'm' is the number of columns in the node.
+  size_t numSamples = log(node->NumColumns()) + 1;
+  node->ColumnSamplesLS(sampledIndices, probabilities, numSamples);
+
+  // Read the dataset through a reference instead of copying the whole matrix.
+  const arma::mat& dataset = node->GetDataset();
+
+  // Initialize weighted projection magnitudes as zeros.
+  arma::vec weightedMagnitudes;
+  weightedMagnitudes.zeros(numSamples);
+
+  // The projection has one coefficient per queued node, plus two when both
+  // additional basis vectors are passed.
+  const bool useAdditional = addBasisVector1 && addBasisVector2;
+  const size_t projectionSize = treeQueue.size() + (useAdditional ? 2 : 0);
+
+  // For each sample, calculate the weighted projection onto the current basis.
+  for(size_t i = 0; i < numSamples; i++)
+  {
+    // Initialize projection as a vector of zeros.
+    arma::vec projection;
+    projection.zeros(projectionSize);
+
+    // Compute the projection of the sampled vector onto the existing subspace.
+    size_t k = 0;
+    CosineNodeQueue::const_iterator j = treeQueue.begin();
+    for(; j != treeQueue.end(); j++, k++)
+    {
+      projection(k) = arma::dot(dataset.col(sampledIndices[i]),
+                                (*j)->BasisVector());
+    }
+    // If two additional vectors are passed, take their projections.
+    if(useAdditional)
+    {
+      projection(k++) = arma::dot(dataset.col(sampledIndices[i]),
+                                  *addBasisVector1);
+      projection(k) = arma::dot(dataset.col(sampledIndices[i]),
+                                *addBasisVector2);
+    }
+
+    // Calculate the Frobenius norm squared of the projected vector.
+    double frobProjection = arma::norm(projection, "frob");
+    double frobProjectionSquared = frobProjection * frobProjection;
+
+    // Calculate the weighted projection magnitude.
+    weightedMagnitudes(i) = frobProjectionSquared / probabilities(i);
+  }
+
+  // Compute mean and standard deviation of the weighted samples.
+  double mu = arma::mean(weightedMagnitudes);
+  double sigma = arma::stddev(weightedMagnitudes);
+
+  // With no spread there is no distribution to fit; use the mean directly.
+  if(!sigma)
+  {
+    node->L2Error(node->FrobNormSquared() - mu);
+    return (node->FrobNormSquared() - mu);
+  }
+
+  // Fit a normal distribution using the calculated statistics, and calculate a
+  // lower bound on the magnitudes for the passed 'delta' parameter.
+  boost::math::normal dist(mu, sigma);
+  double lowerBound = boost::math::quantile(dist, delta);
+
+  // Upper bound on the subspace reconstruction error.
+  node->L2Error(node->FrobNormSquared() - lowerBound);
+  return (node->FrobNormSquared() - lowerBound);
+}
+
+void CosineTree::ConstructBasis(CosineNodeQueue& treeQueue)
+{
+  // Initialize basis as matrix of zeros.
+  basis.zeros(dataset.n_rows, treeQueue.size());
+  
+  // Variables for iterating through the priority queue.
+  CosineNode *currentNode;
+  CosineNodeQueue::const_iterator i = treeQueue.begin();
+  
+  // Transfer basis vectors from the queue to the basis matrix.
+  size_t j = 0;
+  for(; i != treeQueue.end(); i++, j++)
+  {
+    currentNode = *i;
+    basis.col(j) = currentNode->BasisVector();
+  }
+}
+
+}; // namespace tree
+}; // namespace mlpack
+
+#endif
diff --git a/src/mlpack/tests/CMakeLists.txt b/src/mlpack/tests/CMakeLists.txt
index d35779b..ec36031 100644
--- a/src/mlpack/tests/CMakeLists.txt
+++ b/src/mlpack/tests/CMakeLists.txt
@@ -8,6 +8,7 @@ add_executable(mlpack_test
   aug_lagrangian_test.cpp
   cf_test.cpp
   cli_test.cpp
+  cosine_tree_test.cpp
   decision_stump_test.cpp
   det_test.cpp
   distribution_test.cpp
diff --git a/src/mlpack/tests/CMakeLists.txt~ b/src/mlpack/tests/CMakeLists.txt~
index 2e7f3ce..d35779b 100644
--- a/src/mlpack/tests/CMakeLists.txt~
+++ b/src/mlpack/tests/CMakeLists.txt~
@@ -8,6 +8,7 @@ add_executable(mlpack_test
   aug_lagrangian_test.cpp
   cf_test.cpp
   cli_test.cpp
+  decision_stump_test.cpp
   det_test.cpp
   distribution_test.cpp
   emst_test.cpp
@@ -33,6 +34,7 @@ add_executable(mlpack_test
   nca_test.cpp
   nmf_test.cpp
   pca_test.cpp
+  perceptron_test.cpp
   radical_test.cpp
   range_search_test.cpp
   save_restore_utility_test.cpp
diff --git a/src/mlpack/tests/cosine_tree_test.cpp b/src/mlpack/tests/cosine_tree_test.cpp
new file mode 100644
index 0000000..d9c2f37
--- /dev/null
+++ b/src/mlpack/tests/cosine_tree_test.cpp
@@ -0,0 +1,186 @@
+/**
+ * @file cosine_tree_test.cpp
+ * @author Siddharth Agrawal
+ *
+ * Test file for CosineTree class.
+ */
+
+#include <mlpack/core.hpp>
+#include <mlpack/core/tree/cosine_tree/cosine_tree.hpp>
+
+#include <boost/test/unit_test.hpp>
+#include "old_boost_test_definitions.hpp"
+
+BOOST_AUTO_TEST_SUITE(CosineTreeTest);
+
+using namespace mlpack;
+using namespace mlpack::tree;
+
+/**
+ * Constructs a cosine tree with epsilon = 1. Checks if the root node is split
+ * further, as it shouldn't be.
+ */
+BOOST_AUTO_TEST_CASE(CosineTreeNoSplit)
+{
+  // Dataset dimensions and tree parameters; epsilon is deliberately one.
+  const size_t rows = 10;
+  const size_t cols = 15;
+  const double eps = 1;
+  const double del = 0.1;
+
+  // Uniformly random input for the tree.
+  arma::mat dataset = arma::randu<arma::mat>(rows, cols);
+
+  // Build the tree on the dataset and fetch the basis it produced.
+  CosineTree tree(dataset, eps, del);
+  arma::mat basis;
+  tree.GetFinalBasis(basis);
+
+  // With epsilon equal to one the root must stay unsplit, so the basis is
+  // expected to hold exactly one vector, the one coming from the root node.
+  // (REQUIRE aborts the test case immediately on mismatch.)
+  BOOST_REQUIRE_EQUAL(basis.n_cols, 1);
+}
+
+/**
+ * Checks CosineNode::CosineNodeSplit() by doing a depth first search on a
+ * random dataset and checking if it satisfies the split condition.
+ */
+BOOST_AUTO_TEST_CASE(CosineNodeCosineSplit)
+{
+  // Initialize constants required for the test.
+  const size_t numRows = 500;
+  const size_t numCols = 1000;
+  
+  // Make a random dataset and the root object.
+  arma::mat data = arma::randu(numRows, numCols);
+  CosineNode root(data);
+  
+  // Stack for depth first search of the tree.
+  std::vector<CosineNode*> nodeStack;
+  nodeStack.push_back(&root);
+  
+  // Keep going until the stack is empty.
+  while(nodeStack.size())
+  {
+    // Take the node on top of the stack, split it, then remove it.
+    CosineNode *currentNode, *currentLeft, *currentRight;
+    currentNode = nodeStack.back();
+    currentNode->CosineNodeSplit();
+    nodeStack.pop_back();
+    
+    // Obtain pointers to the children of the node.
+    currentLeft = currentNode->Left();
+    currentRight = currentNode->Right();
+    
+    // If both children exist.
+    if(currentLeft && currentRight)
+    {
+      // Push the child nodes on to the stack.
+      nodeStack.push_back(currentLeft);
+      nodeStack.push_back(currentRight);
+      
+      // Obtain the split point of the popped node.
+      arma::vec splitPoint = data.col(currentNode->SplitPointIndex());
+      
+      // Column indices of the child nodes.
+      std::vector<size_t> leftIndices, rightIndices;
+      leftIndices = currentLeft->VectorIndices();
+      rightIndices = currentRight->VectorIndices();
+      
+      // The columns in the popped node should be split into left and right.
+      BOOST_REQUIRE_EQUAL(currentNode->NumColumns(), leftIndices.size() +
+          rightIndices.size());
+      
+      // Cosine of each column with the split point: left columns, then right.
+      arma::vec cosines;
+      cosines.zeros(currentNode->NumColumns());
+      
+      size_t i, j, k;
+      for(i = 0; i < leftIndices.size(); i++)
+      {
+        cosines(i) = arma::norm_dot(data.col(leftIndices[i]), splitPoint);
+      }
+      for(j = 0, k = i; j < rightIndices.size(); j++, k++)
+      {
+        cosines(k) = arma::norm_dot(data.col(rightIndices[j]), splitPoint);
+      }
+      
+      // Left columns must lie closer to cosineMax, right ones to cosineMin;
+      // cosines not below 1 (presumably the split point) are zeroed first.
+      double cosineMax = arma::max(cosines % (cosines < 1));
+      double cosineMin = arma::min(cosines);
+      
+      for(i = 0; i < leftIndices.size(); i++)
+      {
+        BOOST_CHECK_LT(cosineMax - cosines(i), cosines(i) - cosineMin);
+      }
+      for(j = 0, k = i; j < rightIndices.size(); j++, k++)
+      {
+        BOOST_CHECK_GT(cosineMax - cosines(k), cosines(k) - cosineMin);
+      }
+    }
+  }
+}
+
+/**
+ * Checks CosineTree::ModifiedGramSchmidt() by creating a random basis for the
+ * vector subspace and checking if all the vectors are orthogonal to each other.
+ */
+BOOST_AUTO_TEST_CASE(CosineTreeModifiedGramSchmidt)
+{
+  // Dataset dimensions and the parameters for the helper tree.
+  const size_t rows = 100;
+  const size_t cols = 50;
+  const double eps = 1;
+  const double del = 0.1;
+
+  // Random input; each of its columns later serves as a centroid.
+  arma::mat points = arma::randu<arma::mat>(rows, cols);
+
+  // The queue accumulates the basis built so far; the tree object is needed
+  // only to get access to ModifiedGramSchmidt().
+  CosineTree::CosineNodeQueue basisQueue;
+  CosineTree helperTree(points, eps, del);
+
+  for (size_t col = 0; col < cols; ++col)
+  {
+    // Node that will carry the new basis vector into the queue.
+    CosineNode* const carrier = new CosineNode(points);
+
+    // Orthogonalize the current column against the queued basis vectors;
+    // the outcome is written to newBasisVector.
+    arma::vec centroid = points.col(col);
+    arma::vec newBasisVector;
+    helperTree.ModifiedGramSchmidt(basisQueue, centroid, newBasisVector);
+
+    // The result must have a (near) zero dot product with every basis vector
+    // already stored in the queue.
+    typedef CosineTree::CosineNodeQueue::const_iterator QueueIterator;
+    for (QueueIterator it = basisQueue.begin(); it != basisQueue.end(); ++it)
+    {
+      CosineNode* currentNode = *it;
+      BOOST_REQUIRE_SMALL(arma::dot(currentNode->BasisVector(), newBasisVector),
+                          1e-5);
+    }
+
+    // Store the vector in the node, give the node a random L2 error value
+    // and push it onto the queue.
+    carrier->BasisVector(newBasisVector);
+    carrier->L2Error(arma::randu());
+    basisQueue.push(carrier);
+  }
+
+  // Release the nodes allocated above, one per processed column.
+  size_t remaining = cols;
+  while (remaining > 0)
+  {
+    CosineNode* const finished = basisQueue.top();
+    basisQueue.pop();
+
+    delete finished;
+    --remaining;
+  }
+}
+
+BOOST_AUTO_TEST_SUITE_END();
diff --git a/src/mlpack/tests/tree_test.cpp b/src/mlpack/tests/tree_test.cpp
index 98ded62..7c442ba 100644
--- a/src/mlpack/tests/tree_test.cpp
+++ b/src/mlpack/tests/tree_test.cpp
@@ -8,8 +8,6 @@
 #include <mlpack/core/tree/binary_space_tree/binary_space_tree.hpp>
 #include <mlpack/core/metrics/lmetric.hpp>
 #include <mlpack/core/tree/cover_tree/cover_tree.hpp>
-#include <mlpack/core/tree/cosine_tree/cosine_tree.hpp>
-#include <mlpack/core/tree/cosine_tree/cosine_tree_builder.hpp>
 
 #include <queue>
 #include <stack>
@@ -1915,196 +1913,4 @@ BOOST_AUTO_TEST_CASE(CoverTreeDescendantTest)
   CheckDescendants(&tree);
 }
 
-/*
- * Make sure that constructor for cosine tree is working.
- */
-BOOST_AUTO_TEST_CASE(CosineTreeConstructorTest)
-{
-  // Create test data.
-  arma::mat data = arma::randu<arma::mat>(5, 5);
-  arma::rowvec centroid = arma::randu<arma::rowvec>(1, 5);
-  arma::vec probabilities = arma::randu<arma::vec>(5, 1);
-
-  // Creating a cosine tree.
-  CosineTree ct(data, centroid, probabilities);
-
-  const arma::mat& dataRet = ct.Data();
-  const arma::rowvec& centroidRet = ct.Centroid();
-  const arma::vec& probabilitiesRet = ct.Probabilities();
-
-  // Check correctness of dimensionality of data matrix.
-  BOOST_REQUIRE_EQUAL(data.n_cols, dataRet.n_rows);
-  BOOST_REQUIRE_EQUAL(data.n_rows, dataRet.n_cols);
-
-  // Check the data matrix.
-  for (size_t i = 0; i < data.n_cols; i++)
-    for (size_t j = 0; j < data.n_rows; j++)
-      BOOST_REQUIRE_CLOSE((double) dataRet(j, i), (double) data(i, j), 1e-5);
-
-  // Check correctness of dimensionality of centroid.
-  BOOST_REQUIRE_EQUAL(centroid.n_cols, centroidRet.n_cols);
-  BOOST_REQUIRE_EQUAL(centroid.n_rows, centroidRet.n_rows);
-
-  // Check centroid.
-  for (size_t i = 0; i < centroid.n_cols; i++)
-    BOOST_REQUIRE_CLOSE((double) centroidRet(0, i), (double) centroid(0,i),
-        1e-5);
-
-  // Check correctness of dimentionality of sampling probabilities.
-  BOOST_REQUIRE_EQUAL(probabilities.n_cols, probabilitiesRet.n_cols);
-  BOOST_REQUIRE_EQUAL(probabilities.n_rows, probabilitiesRet.n_rows);
-
-  // Check sampling probabilities.
-  for (size_t i = 0; i < probabilities.n_rows; i++)
-    BOOST_REQUIRE_CLOSE((double) probabilitiesRet(i, 0), (double)
-        probabilities(i, 0), 1e-5);
-
-  // Check pointers of children nodes.
-  BOOST_REQUIRE(ct.Right() == NULL);
-  BOOST_REQUIRE(ct.Left() == NULL);
-}
-
-/**
- * Make sure that CTNode function in Cosine tree builder is working.
- */
-BOOST_AUTO_TEST_CASE(CosineTreeEmptyConstructorTest)
-{
-  // Create a tree through the empty constructor.
-  CosineTree ct;
-
-  // Check to make sure it has no children.
-  BOOST_REQUIRE(ct.Right() == NULL);
-  BOOST_REQUIRE(ct.Left() == NULL);
-}
-
-/**
- * Make sure that CTNode function in CosineTreeBuilder is working.
- * This test just validates the dimentionality and data.
- */
-BOOST_AUTO_TEST_CASE(CosineTreeBuilderCTNodeTest)
-{
-  // Create dummy test data.
-  arma::mat data = arma::randu<arma::mat>(5, 5);
-
-  // Create a cosine tree builder object.
-  CosineTreeBuilder builder;
-
-  // Create a cosine tree object.
-  CosineTree ct;
-
-  // Use the builder to create the tree.
-  builder.CTNode(data, ct);
-
-  const arma::mat& dataRet = ct.Data();
-  const arma::rowvec& centroidRet = ct.Centroid();
-  const arma::vec& probabilitiesRet = ct.Probabilities();
-
-  // Check correctness of dimentionality of data.
-  BOOST_REQUIRE_EQUAL(data.n_cols, dataRet.n_cols);
-  BOOST_REQUIRE_EQUAL(data.n_rows, dataRet.n_rows);
-
-  // Check data.
-  for (size_t i = 0; i < data.n_cols; i++)
-    for (size_t j = 0; j < data.n_rows; j++)
-      BOOST_REQUIRE_CLOSE((double) dataRet(j, i), (double) data(i, j), 1e-5);
-
-  // Check correctness of dimensionality of centroid.
-  BOOST_REQUIRE_EQUAL(data.n_rows, centroidRet.n_cols);
-  BOOST_REQUIRE_EQUAL(1, centroidRet.n_rows);
-
-  // Check correctness of dimensionality of sampling probabilities.
-  BOOST_REQUIRE_EQUAL(1, probabilitiesRet.n_cols);
-  BOOST_REQUIRE_EQUAL(data.n_rows, probabilitiesRet.n_rows);
-
-  // Check pointers of children nodes.
-  BOOST_REQUIRE(ct.Right() == NULL);
-  BOOST_REQUIRE(ct.Left() == NULL);
-
-}
-
-/**
- * Make sure that the centroid is calculated correctly when the cosine tree is
- * built.
- */
-BOOST_AUTO_TEST_CASE(CosineTreeBuilderCentroidTest)
-{
-  // Create dummy test data.
-  arma::mat data;
-  data << 1.0 << 2.0 << 3.0 << arma::endr
-       << 4.0 << 2.0 << 3.0 << arma::endr
-       << 2.5 << 3.0 << 2.0 << arma::endr;
-
-  // Expected centroid.
-  arma::vec c;
-  c << 2.0 << 3.0 << 2.5 << arma::endr;
-
-  // Build the cosine tree.
-  CosineTreeBuilder builder;
-  CosineTree ct;
-  builder.CTNode(data, ct);
-
-  // Get the centroid.
-  arma::rowvec centroid = ct.Centroid();
-
-  // Check correctness of the centroid.
-  BOOST_REQUIRE_CLOSE((double) c(0, 0), (double) centroid(0, 0), 1e-5);
-  BOOST_REQUIRE_CLOSE((double) c(1, 0), (double) centroid(0, 1), 1e-5);
-  BOOST_REQUIRE_CLOSE((double) c(2, 0), (double) centroid(0, 2), 1e-5);
-}
-
-/**
- * Make sure that the sampling probabilities are calculated correctly when the
- * cosine tree is built.
- */
-BOOST_AUTO_TEST_CASE(CosineTreeBuilderProbabilitiesTest)
-{
-  // Create dummy test data.
-  arma::mat data;
-  data << 100.0 <<   2.0 <<   3.0 << arma::endr
-       << 400.0 <<   2.0 <<   3.0 << arma::endr
-       << 200.5 <<   3.0 <<   2.0 << arma::endr;
-
-  // Expected sample probability.
-  arma::vec p;
-  p << 0.999907 << 0.00899223 << 0.0102295 << arma::endr;
-
-  // Create the cosine tree.
-  CosineTreeBuilder builder;
-  CosineTree ct;
-  builder.CTNode(data, ct);
-
-  // Get the probabilities.
-  const arma::vec& probabilities = ct.Probabilities();
-
-  // Check correctness of sampling probabilities.
-  BOOST_REQUIRE_CLOSE((double) p(0, 0), (double) probabilities(0, 0), 1e-4);
-  BOOST_REQUIRE_CLOSE((double) p(1, 0), (double) probabilities(1, 0), 1e-4);
-  BOOST_REQUIRE_CLOSE((double) p(2, 0), (double) probabilities(2, 0), 1e-4);
-}
-
-/**
- * Make sure that the cosine tree builder is splitting nodes.
- */
-BOOST_AUTO_TEST_CASE(CosineTreeBuilderCTNodeSplitTest)
-{
-  // Create dummy test data.
-  arma::mat data;
-  data << 100.0 <<   2.0 <<   3.0 << arma::endr
-       << 400.0 <<   2.0 <<   3.0 << arma::endr
-       << 200.5 <<   3.0 <<   2.0 << arma::endr;
-
-  // Build a cosine tree root node, and then split it.
-  CosineTreeBuilder builder;
-  CosineTree root, left, right;
-  builder.CTNode(data, root);
-  builder.CTNodeSplit(root, left, right);
-
-  // Ensure that there is no data loss.
-  BOOST_REQUIRE_EQUAL((left.NumPoints() + right.NumPoints()), root.NumPoints());
-
-  // Ensure that the dimensionality is correct.
-  BOOST_REQUIRE_EQUAL(left.Data().n_cols, data.n_cols);
-  BOOST_REQUIRE_EQUAL(right.Data().n_cols, data.n_cols);
-}
-
 BOOST_AUTO_TEST_SUITE_END();

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list