[mlpack] 311/324: X tree commit

Barak A. Pearlmutter barak+git at cs.nuim.ie
Sun Aug 17 08:22:21 UTC 2014


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch svn-trunk
in repository mlpack.

commit 1af30817cecc4b3d76b4acd9827df1409a902888
Author: andrewmw94 <andrewmw94 at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date:   Thu Aug 14 14:42:43 2014 +0000

    X tree commit
    
    git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@17024 9d5b8971-822b-0410-80eb-d18c1038ef23
---
 src/mlpack/core/tree/CMakeLists.txt                |   4 +
 src/mlpack/core/tree/rectangle_tree.hpp            |   1 +
 .../tree/rectangle_tree/r_star_tree_split_impl.hpp |   4 +-
 .../core/tree/rectangle_tree/rectangle_tree.hpp    |  25 +
 .../tree/rectangle_tree/rectangle_tree_impl.hpp    |   3 +
 .../core/tree/rectangle_tree/x_tree_split.hpp      |  79 ++-
 .../core/tree/rectangle_tree/x_tree_split_impl.hpp | 689 +++++++++++++++++++++
 src/mlpack/tests/rectangle_tree_test.cpp           |  93 +++
 8 files changed, 896 insertions(+), 2 deletions(-)

diff --git a/src/mlpack/core/tree/CMakeLists.txt b/src/mlpack/core/tree/CMakeLists.txt
index a4bd182..6d9fff8 100644
--- a/src/mlpack/core/tree/CMakeLists.txt
+++ b/src/mlpack/core/tree/CMakeLists.txt
@@ -42,6 +42,10 @@ set(SOURCES
   rectangle_tree/r_tree_descent_heuristic_impl.hpp
   rectangle_tree/r_star_tree_descent_heuristic.hpp
   rectangle_tree/r_star_tree_descent_heuristic_impl.hpp
+  rectangle_tree/r_star_tree_split.hpp
+  rectangle_tree/r_star_tree_split_impl.hpp
+  rectangle_tree/x_tree_split.hpp
+  rectangle_tree/x_tree_split_impl.hpp
   statistic.hpp
   traversal_info.hpp
   tree_traits.hpp
diff --git a/src/mlpack/core/tree/rectangle_tree.hpp b/src/mlpack/core/tree/rectangle_tree.hpp
index 2270a82..1760281 100644
--- a/src/mlpack/core/tree/rectangle_tree.hpp
+++ b/src/mlpack/core/tree/rectangle_tree.hpp
@@ -22,5 +22,6 @@
 #include "rectangle_tree/r_tree_descent_heuristic.hpp"
 #include "rectangle_tree/r_star_tree_descent_heuristic.hpp"
 #include "rectangle_tree/traits.hpp"
+#include "rectangle_tree/x_tree_split.hpp"
 
 #endif
diff --git a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
index 75d1e34..33d20b4 100644
--- a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
@@ -262,7 +262,7 @@ bool RStarTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(
     return true;
   }
 
- 
+ /*
   // If we haven't yet reinserted on this level, we try doing so now.
   if(relevels[tree->TreeDepth()]) {
     relevels[tree->TreeDepth()] = false;
@@ -321,6 +321,8 @@ bool RStarTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(
    
    return false;
  }
+
+*/
  
 
   int bestOverlapIndexOnBestAxis = 0;
diff --git a/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp b/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp
index cc5d205..e9b144c 100644
--- a/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp
@@ -39,6 +39,23 @@ template<typename SplitType,
 	 typename MatType = arma::mat>
 class RectangleTree
 {
+ public:
+  /**
+   * The X tree requires that the tree records it's "split history".  To make this easy, we
+   * use the following structure.
+   */
+  typedef struct SplitHistoryStruct {
+    int lastDimension;
+    std::vector<bool> history;
+    SplitHistoryStruct(int dim):
+      history(dim)
+    {
+      for(int i = 0; i < dim; i++)
+        history[i] = false;
+    }
+    
+  } SplitHistoryStruct;  
+  
  private:
   //! The max number of child nodes a non-leaf node can have.
   size_t maxNumChildren;
@@ -66,6 +83,8 @@ class RectangleTree
   HRectBound<> bound;
   //! Any extra data contained in the node.
   StatisticType stat;
+  //! A struct to store the "split history" for X trees.
+  SplitHistoryStruct splitHistory;
   //! The distance from the centroid of this node to the centroid of the parent.
   double parentDistance;
   //! The discance to the furthest descendant, cached to speed things up.
@@ -78,6 +97,7 @@ class RectangleTree
   MatType* localDataset;
 
  public:
+   
   //! So other classes can use TreeType::Mat.
   typedef MatType Mat;
 
@@ -233,6 +253,11 @@ class RectangleTree
   const StatisticType& Stat() const { return stat; }
   //! Modify the statistic object for this node.
   StatisticType& Stat() { return stat; }
+  
+  //! Return the split history object of this node.
+  const SplitHistoryStruct& SplitHistory() const { return splitHistory; }
+  //! Modify the split history object of this node.
+  SplitHistoryStruct& SplitHistory() { return splitHistory; }
 
   //! Return whether or not this node is a leaf (true if it has no children).
   bool IsLeaf() const;
diff --git a/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp b/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp
index b344876..485237e 100644
--- a/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp
@@ -38,6 +38,7 @@ count(0),
 maxLeafSize(maxLeafSize),
 minLeafSize(minLeafSize),
 bound(data.n_rows),
+splitHistory(bound.Dim()),
 parentDistance(0),
 dataset(data),
 points(maxLeafSize + 1), // Add one to make splitting the node simpler.
@@ -69,6 +70,7 @@ count(0),
 maxLeafSize(parentNode->MaxLeafSize()),
 minLeafSize(parentNode->MinLeafSize()),
 bound(parentNode->Bound().Dim()),
+splitHistory(bound.Dim()),
 parentDistance(0),
 dataset(parentNode->Dataset()),
 points(maxLeafSize + 1), // Add one to make splitting the node simpler.
@@ -97,6 +99,7 @@ RectangleTree<SplitType, DescentType, StatisticType, MatType>::RectangleTree(
   maxLeafSize(other.MaxLeafSize()),
   minLeafSize(other.MinLeafSize()),
   bound(other.bound),
+  splitHistory(other.SplitHistory()),
   parentDistance(other.ParentDistance()),
   dataset(other.dataset),
   points(other.Points()),
diff --git a/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp b/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp
index 8d1c8b6..a6800ac 100644
--- a/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp
@@ -1 +1,78 @@
- 
+/**
+ * @file x_tre_split.hpp
+ * @author Andrew Wells
+ *
+ * Defintion of the XTreeSplit class, a class that splits the nodes of an X tree, starting
+ * at a leaf node and moving upwards if necessary.
+ */
+#ifndef __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_HPP
+#define __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree /** Trees and tree-building procedures. */ {
+
+/**
+ * A Rectangle Tree has new points inserted at the bottom.  When these
+ * nodes overflow, we split them, moving up the tree and splitting nodes
+ * as necessary.
+ */
+template<typename DescentType,
+	 typename StatisticType,
+	 typename MatType>
+class XTreeSplit
+{
+public:
+
+/** 
+ * The X-tree paper says that a maximum allowable overlap of 20% works well.
+ */
+const static double MAX_OVERLAP = 0.2;
+  
+/**
+ * Split a leaf node using the algorithm described in "The R*-tree: An Efficient and Robust Access method
+ * for Points and Rectangles."  If necessary, this split will propagate
+ * upwards through the tree.
+ */
+static void SplitLeafNode(RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree, std::vector<bool>& relevels);
+
+/**
+ * Split a non-leaf node using the "default" algorithm.  If this is a root node, the 
+ * tree increases in depth.
+ */
+static bool SplitNonLeafNode(RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree, std::vector<bool>& relevels);
+
+private:
+/**
+ * Class to allow for faster sorting.
+ */
+class sortStruct {
+public:
+  double d;
+  int n;
+};
+
+/**
+ * Comparator for sorting with sortStruct.
+ */
+static bool structComp(const sortStruct& s1, const sortStruct& s2) {
+  return s1.d < s2.d;
+}
+
+/**
+  * Insert a node into another node.
+  */
+static void InsertNodeIntoTree(
+    RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* destTree,
+    RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* srcNode);
+
+};
+
+}; // namespace tree
+}; // namespace mlpack
+
+// Include implementation
+#include "x_tree_split_impl.hpp"
+
+#endif
\ No newline at end of file
diff --git a/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp b/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp
index 8d1c8b6..1ea4018 100644
--- a/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp
@@ -1 +1,690 @@
+/**
+ * @file x_tree_split_impl.hpp
+ * @author Andrew Wells
+ *
+ * Implementation of class (XTreeSplit) to split a RectangleTree.
+ */
+#ifndef __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_IMPL_HPP
+#define __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_IMPL_HPP
+
+#include "x_tree_split.hpp"
+#include "rectangle_tree.hpp"
+#include <mlpack/core/math/range.hpp>
+
+namespace mlpack {
+namespace tree {
+
+/**
+ * We call GetPointSeeds to get the two points which will be the initial points in the new nodes
+ * We then call AssignPointDestNode to assign the remaining points to the two new nodes.
+ * Finally, we delete the old node and insert the new nodes into the tree, spliting the parent
+ * if necessary.
+ */
+template<typename DescentType,
+typename StatisticType,
+typename MatType>
+void XTreeSplit<DescentType, StatisticType, MatType>::SplitLeafNode(
+        RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree,
+        std::vector<bool>& relevels)
+{
+  // If we are splitting the root node, we need will do things differently so that the constructor
+  // and other methods don't confuse the end user by giving an address of another node.
+  if (tree->Parent() == NULL) {
+    RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* copy = 
+        new RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(*tree, false); // We actually want to copy this way.  Pointers and everything.
+
+    copy->Parent() = tree;
+    tree->Count() = 0;
+    tree->NullifyData();
+    tree->Children()[(tree->NumChildren())++] = copy; // Because this was a leaf node, numChildren must be 0.
+    assert(tree->NumChildren() == 1);
+    XTreeSplit<DescentType, StatisticType, MatType>::SplitLeafNode(copy, relevels);
+    return;
+  }
+   
+ // If we haven't yet reinserted on this level, we try doing so now.
+ if(relevels[tree->TreeDepth()]) {
+   relevels[tree->TreeDepth()] = false;
+   // We sort the points by decreasing distance to the centroid of the bound.
+   // We then remove the first p entries and reinsert them at the root.
+   RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* root = tree;
+   while(root->Parent() != NULL)
+     root = root->Parent();
+   size_t p = tree->MaxLeafSize() * 0.3; // The R*-tree paper says this works the best.
+   if(p == 0) {
+     SplitLeafNode(tree, relevels);
+     return;
+   }
+   
+   std::vector<sortStruct> sorted(tree->Count());
+   arma::vec centroid;
+   tree->Bound().Centroid(centroid); // Modifies centroid.
+   for(size_t i = 0; i < sorted.size(); i++) {
+     sorted[i].d = tree->Bound().Metric().Evaluate(centroid, tree->LocalDataset().col(i));
+     sorted[i].n = i;
+   }
+   
+   std::sort(sorted.begin(), sorted.end(), structComp);
+   std::vector<int> pointIndices(p);
+   for(size_t i = 0; i < p; i++) {
+     pointIndices[i] = tree->Points()[sorted[sorted.size()-1-i].n]; // We start from the end of sorted.
+     root->DeletePoint(tree->Points()[sorted[sorted.size()-1-i].n], relevels);
+   }
+   for(size_t i = 0; i < p; i++) { // We reverse the order again to reinsert the closest points first.
+     root->InsertPoint(pointIndices[p-1-i], relevels);
+   }
+   
+//    // If we went below min fill, delete this node and reinsert all points.
+//    if(tree->Count() < tree->MinLeafSize()) {
+//      std::vector<int> pointIndices(tree->Count());
+//      for(size_t i = 0; i < tree->Count(); i++) {
+//        pointIndices[i] = tree->Points()[i];
+//      }
+//      root->RemoveNode(tree, relevels);
+//      for(size_t i = 0; i < pointIndices.size(); i++) {
+//        root->InsertPoint(pointIndices[i], relevels);
+//      }
+//      //tree->SoftDelete();      
+//    }
+   return;
+ }
  
+
+  int bestOverlapIndexOnBestAxis = 0;
+  int bestAreaIndexOnBestAxis = 0;
+  bool tiedOnOverlap = false;
+  int bestAxis = 0;
+  double bestAxisScore = DBL_MAX;
+  for (int j = 0; j < tree->Bound().Dim(); j++) {
+    double axisScore = 0.0;
+    // Since we only have points in the leaf nodes, we only need to sort once.
+    std::vector<sortStruct> sorted(tree->Count());
+    for (int i = 0; i < sorted.size(); i++) {
+      sorted[i].d = tree->LocalDataset().col(i)[j];
+      sorted[i].n = i;
+    }
+
+    std::sort(sorted.begin(), sorted.end(), structComp);
+
+    // We'll store each of the three scores for each distribution.
+    std::vector<double> areas(tree->MaxLeafSize() - 2 * tree->MinLeafSize() + 2);
+    std::vector<double> margins(tree->MaxLeafSize() - 2 * tree->MinLeafSize() + 2);
+    std::vector<double> overlapedAreas(tree->MaxLeafSize() - 2 * tree->MinLeafSize() + 2);
+    for (int i = 0; i < areas.size(); i++) {
+      areas[i] = 0.0;
+      margins[i] = 0.0;
+      overlapedAreas[i] = 0.0;
+    }
+    for (int i = 0; i < areas.size(); i++) {
+      // The ith arrangement is obtained by placing the first tree->MinLeafSize() + i 
+      // points in one rectangle and the rest in another.  Then we calculate the three
+      // scores for that distribution.
+
+      int cutOff = tree->MinLeafSize() + i;
+      // We'll calculate the max and min in each dimension by hand to save time.
+      std::vector<double> maxG1(tree->Bound().Dim());
+      std::vector<double> minG1(maxG1.size());
+      std::vector<double> maxG2(maxG1.size());
+      std::vector<double> minG2(maxG1.size());
+      for (int k = 0; k < tree->Bound().Dim(); k++) {
+        minG1[k] = maxG1[k] = tree->LocalDataset().col(sorted[0].n)[k];
+        minG2[k] = maxG2[k] = tree->LocalDataset().col(sorted[sorted.size() - 1].n)[k];
+        for (int l = 1; l < tree->Count() - 1; l++) {
+          if (l < cutOff) {
+            if (tree->LocalDataset().col(sorted[l].n)[k] < minG1[k])
+              minG1[k] = tree->LocalDataset().col(sorted[l].n)[k];
+            else if (tree->LocalDataset().col(sorted[l].n)[k] > maxG1[k])
+              maxG1[k] = tree->LocalDataset().col(sorted[l].n)[k];
+          } else {
+            if (tree->LocalDataset().col(sorted[l].n)[k] < minG2[k])
+              minG2[k] = tree->LocalDataset().col(sorted[l].n)[k];
+            else if (tree->LocalDataset().col(sorted[l].n)[k] > maxG2[k])
+              maxG2[k] = tree->LocalDataset().col(sorted[l].n)[k];
+          }
+        }
+      }
+      double area1 = 1.0, area2 = 1.0;
+      double oArea = 1.0;
+      for (int k = 0; k < maxG1.size(); k++) {
+        margins[i] += maxG1[k] - minG1[k] + maxG2[k] - minG2[k];
+        area1 *= maxG1[k] - minG1[k];
+        area2 *= maxG2[k] - minG2[k];
+        oArea *= maxG1[k] < minG2[k] || maxG2[k] < minG1[k] ? 0.0 : std::min(maxG1[k], maxG2[k]) - std::max(minG1[k], minG2[k]);
+      }
+      areas[i] += area1 + area2;
+      overlapedAreas[i] += oArea;
+      axisScore += margins[i];
+    }
+
+    if (axisScore < bestAxisScore) {
+      bestAxisScore = axisScore;
+      bestAxis = j;
+      bestOverlapIndexOnBestAxis = 0;
+      bestAreaIndexOnBestAxis = 0;
+      for (int i = 1; i < areas.size(); i++) {
+        if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis]) {
+          tiedOnOverlap = false;
+          bestAreaIndexOnBestAxis = i;
+          bestOverlapIndexOnBestAxis = i;
+        } else if (overlapedAreas[i] == overlapedAreas[bestOverlapIndexOnBestAxis]) {
+          tiedOnOverlap = true;
+          if (areas[i] < areas[bestAreaIndexOnBestAxis])
+            bestAreaIndexOnBestAxis = i;
+        }
+      }
+    }
+  }
+
+  std::vector<sortStruct> sorted(tree->Count());
+  for (int i = 0; i < sorted.size(); i++) {
+    sorted[i].d = tree->LocalDataset().col(i)[bestAxis];
+    sorted[i].n = i;
+  }
+
+  std::sort(sorted.begin(), sorted.end(), structComp);
+
+  RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeOne = new
+          RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+  RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeTwo = new
+          RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+
+  // The leaf nodes should never have any overlap introduced by the above method
+  // since a split axis is chosen and then points are assigned based on their value
+  // along that axis.
+  if (tiedOnOverlap) {
+    for (int i = 0; i < tree->Count(); i++) {
+      if (i < bestAreaIndexOnBestAxis + tree->MinLeafSize())
+        treeOne->InsertPoint(tree->Points()[sorted[i].n]);
+      else
+        treeTwo->InsertPoint(tree->Points()[sorted[i].n]);
+    }
+  } else {
+    for (int i = 0; i < tree->Count(); i++) {
+      if (i < bestOverlapIndexOnBestAxis + tree->MinLeafSize())
+        treeOne->InsertPoint(tree->Points()[sorted[i].n]);
+      else
+        treeTwo->InsertPoint(tree->Points()[sorted[i].n]);
+    }
+  }
+
+  //Remove this node and insert treeOne and treeTwo
+  RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* par = tree->Parent();
+  int index = -1;
+  for (int i = 0; i < par->NumChildren(); i++) {
+    if (par->Children()[i] == tree) {
+      index = i;
+      break;
+    }
+  }
+  assert(index != -1);
+  par->Children()[index] = treeOne;
+  par->Children()[par->NumChildren()++] = treeTwo;
+  
+  //We now update the split history of each new node.
+  treeOne->SplitHistory().history[bestAxis] = true;
+  treeOne->SplitHistory().lastDimension = bestAxis;
+  treeTwo->SplitHistory().history[bestAxis] = true;
+  treeTwo->SplitHistory().lastDimension = bestAxis;  
+  
+  // we only add one at a time, so we should only need to test for equality
+  // just in case, we use an assert.
+  assert(par->NumChildren() <= par->MaxNumChildren()+1);
+  if (par->NumChildren() == par->MaxNumChildren()+1) {
+    SplitNonLeafNode(par, relevels);
+  }
+
+  assert(treeOne->Parent()->NumChildren() <= treeOne->MaxNumChildren());
+  assert(treeOne->Parent()->NumChildren() >= treeOne->MinNumChildren());
+  assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
+  assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
+
+  tree->SoftDelete();
+
+  return;
+}
+
+/**
+ * We call GetBoundSeeds to get the two new nodes that this one will be broken
+ * into.  Then we call AssignNodeDestNode to move the children of this node
+ * into either of those two nodes.  Finally, we delete the now unused information
+ * and recurse up the tree if necessary.  We don't need to worry about the bounds
+ * higher up the tree because they were already updated if necessary.
+ */
+template<typename DescentType,
+typename StatisticType,
+typename MatType>
+bool XTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(
+        RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree,
+        std::vector<bool>& relevels)
+{
+  // If we are splitting the root node, we need will do things differently so that the constructor
+  // and other methods don't confuse the end user by giving an address of another node.
+  if (tree->Parent() == NULL) {
+    RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* copy = 
+        new RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(*tree, false); // We actually want to copy this way.  Pointers and everything.
+
+    copy->Parent() = tree;
+    tree->NumChildren() = 0;
+    tree->NullifyData();
+    tree->Children()[(tree->NumChildren())++] = copy;
+    XTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(copy, relevels);
+    return true;
+  }
+
+  // The X tree paper doesn't explain how to handle the split history when reinserting nodes
+  // and reinserting nodes seems to hurt the performance, so we don't do it.
+  
+  
+  // We find the split axis that will be used if the topological split fails now
+  // to save CPU time.
+  
+  // Find the next split axis.
+  std::vector<bool> axes(tree->Bound().Dim());
+  std::vector<int> dimensionsLastUsed(tree->NumChildren());
+  for(size_t i = 0; i < tree->NumChildren(); i++)
+    dimensionsLastUsed[i] = tree->Child(i).SplitHistory().lastDimension;
+  std::sort(dimensionsLastUsed.begin(), dimensionsLastUsed.end());
+  
+  int lastDim = dimensionsLastUsed[dimensionsLastUsed.size()/2];
+  int minOverlapSplitDimension = -1;
+  
+  // See if we can use a new dimension.
+  for(size_t i = lastDim + 1; i < axes.size(); i++) {
+    axes[i] = true;
+    for(size_t j = 0; j < tree->NumChildren(); j++)
+      axes[i] = axes[i] & tree->Child(j).SplitHistory().history[i];
+    if(axes[i] == true) {
+      minOverlapSplitDimension = i;
+      break;
+    }
+  }
+  if(minOverlapSplitDimension == -1) {
+    for(size_t i = 0; i < lastDim + 1; i++) {
+      axes[i] = true;
+      for(size_t j = 0; j < tree->NumChildren(); j++)
+        axes[i] = axes[i] & tree->Child(j).SplitHistory().history[i];
+      if(axes[i] == true) {
+        minOverlapSplitDimension = i;
+        break;
+      }
+    }      
+  }
+  
+  bool minOverlapSplitUsesHi = false;
+  double bestScoreMinOverlapSplit = DBL_MAX;
+  double areaOfBestMinOverlapSplit = 0;
+  int bestIndexMinOverlapSplit = 0;
+  
+    
+  int bestOverlapIndexOnBestAxis = 0;
+  int bestAreaIndexOnBestAxis = 0;
+  bool tiedOnOverlap = false;
+  bool lowIsBest = true;
+  int bestAxis = 0;
+  double bestAxisScore = DBL_MAX;
+  double overlapBestOverlapAxis = 0;
+  double areaBestOverlapAxis = 0;
+  double overlapBestAreaAxis = 0;
+  double areaBestAreaAxis = 0;
+  
+  for (int j = 0; j < tree->Bound().Dim(); j++) {
+    double axisScore = 0.0;
+
+    // We'll do Bound().Lo() now and use Bound().Hi() later.
+    std::vector<sortStruct> sorted(tree->NumChildren());
+    for (int i = 0; i < sorted.size(); i++) {
+      sorted[i].d = tree->Children()[i]->Bound()[j].Lo();
+      sorted[i].n = i;
+    }
+
+    std::sort(sorted.begin(), sorted.end(), structComp);
+
+    // We'll store each of the three scores for each distribution.
+    std::vector<double> areas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+    std::vector<double> margins(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+    std::vector<double> overlapedAreas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+    for (int i = 0; i < areas.size(); i++) {
+      areas[i] = 0.0;
+      margins[i] = 0.0;
+      overlapedAreas[i] = 0.0;
+    }
+
+    for (int i = 0; i < areas.size(); i++) {
+      // The ith arrangement is obtained by placing the first tree->MinNumChildren() + i 
+      // points in one rectangle and the rest in another.  Then we calculate the three
+      // scores for that distribution.
+
+      int cutOff = tree->MinNumChildren() + i;
+      // We'll calculate the max and min in each dimension by hand to save time.
+      std::vector<double> maxG1(tree->Bound().Dim());
+      std::vector<double> minG1(maxG1.size());
+      std::vector<double> maxG2(maxG1.size());
+      std::vector<double> minG2(maxG1.size());
+      for (int k = 0; k < tree->Bound().Dim(); k++) {
+        minG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Lo();
+        maxG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Hi();
+        minG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Lo();
+        maxG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Hi();
+        for (int l = 1; l < tree->NumChildren() - 1; l++) {
+          if (l < cutOff) {
+            if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG1[k])
+              minG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+            else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG1[k])
+              maxG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+          } else {
+            if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG2[k])
+              minG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+            else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG2[k])
+              maxG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+          }
+        }
+      }
+      double area1 = 1.0, area2 = 1.0;
+      double oArea = 1.0;
+      for (int k = 0; k < maxG1.size(); k++) {
+        margins[i] += maxG1[k] - minG1[k] + maxG2[k] - minG2[k];
+        area1 *= maxG1[k] - minG1[k];
+        area2 *= maxG2[k] - minG2[k];
+        oArea *= maxG1[k] < minG2[k] || maxG2[k] < minG1[k] ? 0.0 : std::min(maxG1[k], maxG2[k]) - std::max(minG1[k], minG2[k]);
+      }
+      areas[i] += area1 + area2;
+      overlapedAreas[i] += oArea;
+      axisScore += margins[i];
+    }
+    if (axisScore < bestAxisScore) {
+      bestAxisScore = axisScore;
+      bestAxis = j;
+      double bestOverlapIndexOnBestAxis = 0;
+      double bestAreaIndexOnBestAxis = 0;
+      for (int i = 1; i < areas.size(); i++) {
+        if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis]) {
+          tiedOnOverlap = false;
+          bestAreaIndexOnBestAxis = i;
+          bestOverlapIndexOnBestAxis = i;
+          overlapBestOverlapAxis = overlapedAreas[i];
+          areaBestOverlapAxis = areas[i];
+        } else if (overlapedAreas[i] == overlapedAreas[bestOverlapIndexOnBestAxis]) {
+          tiedOnOverlap = true;
+          if (areas[i] < areas[bestAreaIndexOnBestAxis]) {
+            bestAreaIndexOnBestAxis = i;
+            overlapBestAreaAxis = overlapedAreas[i];
+            areaBestAreaAxis = areas[i];
+          }
+        }
+      }
+    }
+    
+    // Track the minOverlapSplit data
+    if(minOverlapSplitDimension != -1 && j == minOverlapSplitDimension) {
+      for(int i = 0; i < overlapedAreas.size(); i++) {
+        if(overlapedAreas[i] < bestScoreMinOverlapSplit) {
+          bestScoreMinOverlapSplit = overlapedAreas[i];
+          bestIndexMinOverlapSplit = i;
+          areaOfBestMinOverlapSplit = areas[i];
+        }
+      }
+    }
+  }
+
+  //Now we do the same thing using Bound().Hi() and choose the best of the two.
+  for (int j = 0; j < tree->Bound().Dim(); j++) {
+    double axisScore = 0.0;
+
+    // We'll do Bound().Lo() now and use Bound().Hi() later.
+    std::vector<sortStruct> sorted(tree->NumChildren());
+    for (int i = 0; i < sorted.size(); i++) {
+      sorted[i].d = tree->Children()[i]->Bound()[j].Hi();
+      sorted[i].n = i;
+    }
+
+    std::sort(sorted.begin(), sorted.end(), structComp);
+
+    // We'll store each of the three scores for each distribution.
+    std::vector<double> areas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+    std::vector<double> margins(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+    std::vector<double> overlapedAreas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+    for (int i = 0; i < areas.size(); i++) {
+      areas[i] = 0.0;
+      margins[i] = 0.0;
+      overlapedAreas[i] = 0.0;
+    }
+
+    for (int i = 0; i < areas.size(); i++) {
+      // The ith arrangement is obtained by placing the first tree->MinNumChildren() + i 
+      // points in one rectangle and the rest in another.  Then we calculate the three
+      // scores for that distribution.
+
+      int cutOff = tree->MinNumChildren() + i;
+      // We'll calculate the max and min in each dimension by hand to save time.
+      std::vector<double> maxG1(tree->Bound().Dim());
+      std::vector<double> minG1(maxG1.size());
+      std::vector<double> maxG2(maxG1.size());
+      std::vector<double> minG2(maxG1.size());
+      for (int k = 0; k < tree->Bound().Dim(); k++) {
+        minG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Lo();
+        maxG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Hi();
+        minG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Lo();
+        maxG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Hi();
+        for (int l = 1; l < tree->NumChildren() - 1; l++) {
+          if (l < cutOff) {
+            if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG1[k])
+              minG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+            else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG1[k])
+              maxG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+          } else {
+            if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG2[k])
+              minG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+            else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG2[k])
+              maxG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+          }
+        }
+      }
+      double area1 = 1.0, area2 = 1.0;
+      double oArea = 1.0;
+      for (int k = 0; k < maxG1.size(); k++) {
+        margins[i] += maxG1[k] - minG1[k] + maxG2[k] - minG2[k];
+        area1 *= maxG1[k] - minG1[k];
+        area2 *= maxG2[k] - minG2[k];
+        oArea *= maxG1[k] < minG2[k] || maxG2[k] < minG1[k] ? 0.0 : std::min(maxG1[k], maxG2[k]) - std::max(minG1[k], minG2[k]);
+      }
+      areas[i] += area1 + area2;
+      overlapedAreas[i] += oArea;
+      axisScore += margins[i];
+    }
+    if (axisScore < bestAxisScore) {
+      bestAxisScore = axisScore;
+      bestAxis = j;
+      lowIsBest = false;
+      double bestOverlapIndexOnBestAxis = 0;
+      double bestAreaIndexOnBestAxis = 0;
+      for (int i = 1; i < areas.size(); i++) {
+        if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis]) {
+          tiedOnOverlap = false;
+          bestAreaIndexOnBestAxis = i;
+          bestOverlapIndexOnBestAxis = i;
+          overlapBestOverlapAxis = overlapedAreas[i];
+          areaBestOverlapAxis = areas[i];
+        } else if (overlapedAreas[i] == overlapedAreas[bestOverlapIndexOnBestAxis]) {
+          tiedOnOverlap = true;
+          if (areas[i] < areas[bestAreaIndexOnBestAxis]) {
+            bestAreaIndexOnBestAxis = i;
+            overlapBestAreaAxis = overlapedAreas[i];
+            areaBestAreaAxis = areas[i];
+          }
+        }
+      }
+    }
+    
+    // Track the minOverlapSplit data
+    if(minOverlapSplitDimension != -1 && j == minOverlapSplitDimension) {
+      for(int i = 0; i < overlapedAreas.size(); i++) {
+        if(overlapedAreas[i] < bestScoreMinOverlapSplit) {
+          minOverlapSplitUsesHi = true;
+          bestScoreMinOverlapSplit = overlapedAreas[i];
+          bestIndexMinOverlapSplit = i;
+          areaOfBestMinOverlapSplit = areas[i];
+        }
+      }
+    }
+  }
+
+  std::vector<sortStruct> sorted(tree->NumChildren());
+  if (lowIsBest) {
+    for (int i = 0; i < sorted.size(); i++) {
+      sorted[i].d = tree->Children()[i]->Bound()[bestAxis].Lo();
+      sorted[i].n = i;
+    }
+  } else {
+    for (int i = 0; i < sorted.size(); i++) {
+      sorted[i].d = tree->Children()[i]->Bound()[bestAxis].Hi();
+      sorted[i].n = i;
+    }
+  }
+
+  std::sort(sorted.begin(), sorted.end(), structComp);
+
+  RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeOne = new
+          RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+  RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeTwo = new
+          RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+
+  // Now as per the X-tree paper, we ensure that this split was good enough.
+  bool useMinOverlapSplit = false;
+  if (tiedOnOverlap) {
+    if(overlapBestAreaAxis/areaBestAreaAxis < MAX_OVERLAP) {
+      for (int i = 0; i < tree->NumChildren(); i++) {
+        if (i < bestAreaIndexOnBestAxis + tree->MinNumChildren())
+          InsertNodeIntoTree(treeOne, tree->Children()[sorted[i].n]);
+        else
+          InsertNodeIntoTree(treeTwo, tree->Children()[sorted[i].n]);
+      }
+    } else
+      useMinOverlapSplit = true;
+  } else {
+    if(overlapBestOverlapAxis/areaBestOverlapAxis < MAX_OVERLAP) {
+      for (int i = 0; i < tree->NumChildren(); i++) {
+        if (i < bestOverlapIndexOnBestAxis + tree->MinNumChildren())
+          InsertNodeIntoTree(treeOne, tree->Children()[sorted[i].n]);
+        else
+          InsertNodeIntoTree(treeTwo, tree->Children()[sorted[i].n]);
+      }
+    } else
+      useMinOverlapSplit = true;
+  }
+  
+  // If the split was not good enough, then we try the minimal overlap split.  If that fails, we create
+  // a "super node" (more accurately we resize this one to make it a super node).
+  if(useMinOverlapSplit) {
+    // If there is a dimension that might work, try that.
+    if(minOverlapSplitDimension != -1 && bestScoreMinOverlapSplit / areaOfBestMinOverlapSplit < MAX_OVERLAP) {
+      std::vector<sortStruct> sorted2(tree->NumChildren());
+      if (minOverlapSplitUsesHi) {
+        for (int i = 0; i < sorted2.size(); i++) {
+          sorted2[i].d = tree->Children()[i]->Bound()[bestAxis].Hi();
+          sorted2[i].n = i;
+        }
+      } else {
+        for (int i = 0; i < sorted2.size(); i++) {
+          sorted2[i].d = tree->Children()[i]->Bound()[bestAxis].Lo();
+          sorted2[i].n = i;
+        }
+      }
+      std::sort(sorted2.begin(), sorted2.end(), structComp);
+      for (int i = 0; i < tree->NumChildren(); i++) {
+        if (i < bestIndexMinOverlapSplit + tree->MinNumChildren())
+          InsertNodeIntoTree(treeOne, tree->Children()[sorted[i].n]);
+        else
+          InsertNodeIntoTree(treeTwo, tree->Children()[sorted[i].n]);
+      }
+    } else {
+      
+      
+      
+      
+      
+      // The min overlap split failed so we create a supernode instead.
+      tree->MaxNumChildren() *= 2;
+      tree->MaxLeafSize() *= 2;
+      tree->LocalDataset().resize(tree->LocalDataset().n_rows, 2*tree->LocalDataset().n_cols);
+      tree->Children().resize(tree->MaxNumChildren()+1);
+      tree->Points().resize(tree->MaxLeafSize()+1);
+      
+      return false;
+      
+    }
+  }
+
+  // Update the split history of each child.
+  
+  treeOne->SplitHistory().history[bestAxis] = true;
+  treeOne->SplitHistory().lastDimension = bestAxis;
+  treeTwo->SplitHistory().history[bestAxis] = true;
+  treeTwo->SplitHistory().lastDimension = bestAxis;
+  
+  
+  
+  
+  
+  
+  
+
+  //Remove this node and insert treeOne and treeTwo
+  RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* par = tree->Parent();
+  int index = 0;
+  for (int i = 0; i < par->NumChildren(); i++) {
+    if (par->Children()[i] == tree) {
+      index = i;
+      break;
+    }
+  }
+  par->Children()[index] = treeOne;
+  par->Children()[par->NumChildren()++] = treeTwo;
+
+  // we only add one at a time, so we should only need to test for equality
+  // just in case, we use an assert.
+  assert(par->NumChildren() <= par->MaxNumChildren()+1);
+  if (par->NumChildren() == par->MaxNumChildren()+1) {
+    SplitNonLeafNode(par, relevels);
+  }
+  
+  // We have to update the children of each of these new nodes so that they record the 
+  // correct parent.
+  for (int i = 0; i < treeOne->NumChildren(); i++) {
+    treeOne->Children()[i]->Parent() = treeOne;
+  }
+  for (int i = 0; i < treeTwo->NumChildren(); i++) {
+    treeTwo->Children()[i]->Parent() = treeTwo;
+  }
+    
+
+  assert(treeOne->Parent()->NumChildren() <= treeOne->MaxNumChildren());
+  assert(treeOne->Parent()->NumChildren() >= treeOne->MinNumChildren());
+  assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
+  assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
+  
+  assert(treeOne->MaxNumChildren() < 7);
+  assert(treeTwo->MaxNumChildren() < 7);
+
+  tree->SoftDelete();
+  
+  return false;
+}
+
+/**
+ * Insert a node into another node.  Expanding the bounds and updating the numberOfChildren.
+ */
+template<typename DescentType,
+typename StatisticType,
+typename MatType>
+void XTreeSplit<DescentType, StatisticType, MatType>::InsertNodeIntoTree(
+        RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* destTree,
+        RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* srcNode)
+{
+  destTree->Bound() |= srcNode->Bound();
+  destTree->Children()[destTree->NumChildren()++] = srcNode;
+}
+
+}; // namespace tree
+}; // namespace mlpack
+
+#endif 
diff --git a/src/mlpack/tests/rectangle_tree_test.cpp b/src/mlpack/tests/rectangle_tree_test.cpp
index e8a3223..d1210db 100644
--- a/src/mlpack/tests/rectangle_tree_test.cpp
+++ b/src/mlpack/tests/rectangle_tree_test.cpp
@@ -610,6 +610,99 @@ BOOST_AUTO_TEST_CASE(SingleTreeTraverserTest) {
   }
 }
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+
+
+// A test to ensure that the SingleTreeTraverser is working correctly by comparing
+// its results to the results of a naive search.
+BOOST_AUTO_TEST_CASE(XTreeTraverserTest) {
+  arma::mat dataset;
+  dataset.randu(8, 1000); // 1000 points in 8 dimensions.
+  arma::Mat<size_t> neighbors1;
+  arma::mat distances1;
+  arma::Mat<size_t> neighbors2;
+  arma::mat distances2;
+
+  RectangleTree<tree::XTreeSplit<tree::RStarTreeDescentHeuristic, NeighborSearchStat<NearestNeighborSort>, arma::mat>,
+          tree::RStarTreeDescentHeuristic,
+          NeighborSearchStat<NearestNeighborSort>,
+          arma::mat> RTree(dataset, 20, 6, 5, 2, 0);
+
+  // nearest neighbor search with the R tree.
+  mlpack::neighbor::NeighborSearch<NearestNeighborSort, metric::LMetric<2, true>,
+          RectangleTree<tree::XTreeSplit<tree::RStarTreeDescentHeuristic, NeighborSearchStat<NearestNeighborSort>, arma::mat>,
+          tree::RStarTreeDescentHeuristic,
+          NeighborSearchStat<NearestNeighborSort>,
+          arma::mat> > allknn1(&RTree,
+          dataset, true);
+
+  BOOST_REQUIRE_EQUAL(RTree.NumDescendants(), 1000);
+//   checkSync(RTree);
+//   checkContainment(RTree);
+//   checkExactContainment(RTree);
+
+  allknn1.Search(5, neighbors1, distances1);
+
+  // nearest neighbor search the naive way.
+  mlpack::neighbor::AllkNN allknn2(dataset,
+          true, true);
+
+  allknn2.Search(5, neighbors2, distances2);
+
+  for (size_t i = 0; i < neighbors1.size(); i++) {
+    BOOST_REQUIRE_EQUAL(neighbors1[i], neighbors2[i]);
+    BOOST_REQUIRE_EQUAL(distances1[i], distances2[i]);
+  }
+}
+
+
+
+
+
+*/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 // Test the tree splitting.  We set MaxLeafSize and MaxNumChildren rather low
 // to allow us to test by hand without adding hundreds of points.
 BOOST_AUTO_TEST_CASE(RTreeSplitTest) {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list